2025-07-16 14:07:30 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import json
|
|
|
|
import logging
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
import os
|
|
|
|
from datetime import datetime
|
2025-07-16 14:07:30 -04:00
|
|
|
from pathlib import Path
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
from typing import Any, Dict, List, Literal, Optional
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
from langchain_core.messages import HumanMessage, SystemMessage
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
from langgraph.prebuilt import create_react_agent
|
|
|
|
from pydantic import BaseModel, Field, field_validator
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
from .agent_state import AgentState
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
from .constants import SREConstants
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
from .llm_utils import create_llm_with_error_handling
|
|
|
|
from .memory import create_conversation_memory_manager
|
|
|
|
from .memory.client import SREMemoryClient
|
|
|
|
from .memory.config import _load_memory_config
|
|
|
|
from .memory.hooks import MemoryHookProvider
|
|
|
|
from .memory.tools import create_memory_tools
|
2025-07-16 14:07:30 -04:00
|
|
|
from .output_formatter import create_formatter
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
from .prompt_loader import prompt_loader
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
|
|
|
|
def _get_user_from_env() -> str:
|
|
|
|
"""Get user_id from environment variable.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
user_id from USER_ID environment variable or default
|
|
|
|
"""
|
|
|
|
user_id = os.getenv("USER_ID")
|
|
|
|
if user_id:
|
|
|
|
logger.info(f"Using user_id from environment: {user_id}")
|
|
|
|
return user_id
|
|
|
|
else:
|
|
|
|
# Fallback to default user_id
|
|
|
|
default_user_id = SREConstants.agents.default_user_id
|
|
|
|
logger.warning(
|
|
|
|
f"USER_ID not set in environment, using default: {default_user_id}"
|
|
|
|
)
|
|
|
|
return default_user_id
|
|
|
|
|
|
|
|
|
|
|
|
def _get_session_from_env(mode: str) -> str:
|
|
|
|
"""Get session_id from environment variable or generate one.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
mode: "interactive" or "prompt" for auto-generation prefix
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
session_id from SESSION_ID environment variable or auto-generated
|
|
|
|
"""
|
|
|
|
session_id = os.getenv("SESSION_ID")
|
|
|
|
if session_id:
|
|
|
|
logger.info(f"Using session_id from environment: {session_id}")
|
|
|
|
return session_id
|
|
|
|
else:
|
|
|
|
# Auto-generate session_id
|
|
|
|
auto_session_id = f"{mode}-{datetime.now().strftime('%Y%m%d%H%M%S')}"
|
|
|
|
logger.info(
|
|
|
|
f"SESSION_ID not set in environment, auto-generated: {auto_session_id}"
|
|
|
|
)
|
|
|
|
return auto_session_id
|
|
|
|
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
# Configure logging with basicConfig
|
|
|
|
logging.basicConfig(
|
|
|
|
level=logging.INFO, # Set the log level to INFO
|
|
|
|
# Define log message format
|
|
|
|
format="%(asctime)s,p%(process)s,{%(filename)s:%(lineno)d},%(levelname)s,%(message)s",
|
|
|
|
)
|
|
|
|
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
# Enable HTTP and MCP protocol logs for debugging
|
|
|
|
# Comment out the following lines to suppress these logs if needed
|
|
|
|
# mcp_loggers = ["streamable_http", "mcp.client.streamable_http", "httpx", "httpcore"]
|
|
|
|
#
|
|
|
|
# for logger_name in mcp_loggers:
|
|
|
|
# mcp_logger = logging.getLogger(logger_name)
|
|
|
|
# mcp_logger.setLevel(logging.WARNING)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
def _json_serializer(obj):
|
|
|
|
"""JSON serializer for objects not serializable by default json code."""
|
|
|
|
if isinstance(obj, datetime):
|
|
|
|
return obj.isoformat()
|
|
|
|
raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
|
|
|
|
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
class InvestigationPlan(BaseModel):
|
|
|
|
"""Investigation plan created by supervisor."""
|
|
|
|
|
|
|
|
steps: List[str] = Field(
|
|
|
|
description="List of 3-5 investigation steps to be executed"
|
|
|
|
)
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
|
|
|
|
@field_validator("steps", mode="before")
|
|
|
|
@classmethod
|
|
|
|
def validate_steps(cls, v):
|
|
|
|
"""Convert string steps to list if needed."""
|
|
|
|
if isinstance(v, str):
|
|
|
|
# Split by numbered lines and clean up
|
|
|
|
import re
|
|
|
|
|
|
|
|
lines = v.strip().split("\n")
|
|
|
|
steps = []
|
|
|
|
for line in lines:
|
|
|
|
line = line.strip()
|
|
|
|
if line:
|
|
|
|
# Remove numbering like "1.", "2.", etc.
|
|
|
|
clean_line = re.sub(r"^\d+\.\s*", "", line)
|
|
|
|
if clean_line:
|
|
|
|
steps.append(clean_line)
|
|
|
|
return steps
|
|
|
|
return v
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
agents_sequence: List[str] = Field(
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
description="Sequence of agents to invoke (kubernetes_agent, logs_agent, metrics_agent, runbooks_agent)"
|
2025-07-16 14:07:30 -04:00
|
|
|
)
|
|
|
|
complexity: Literal["simple", "complex"] = Field(
|
|
|
|
description="Whether this plan is simple (auto-execute) or complex (needs approval)"
|
|
|
|
)
|
|
|
|
auto_execute: bool = Field(
|
|
|
|
description="Whether to execute automatically or ask for user approval"
|
|
|
|
)
|
|
|
|
reasoning: str = Field(
|
|
|
|
description="Brief explanation of the investigation approach"
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
class RouteDecision(BaseModel):
|
|
|
|
"""Decision made by supervisor for routing."""
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
next: Literal[
|
|
|
|
"kubernetes_agent", "logs_agent", "metrics_agent", "runbooks_agent", "FINISH"
|
|
|
|
] = Field(description="The next agent to route to, or FINISH if done")
|
2025-07-16 14:07:30 -04:00
|
|
|
reasoning: str = Field(
|
|
|
|
description="Brief explanation of why this routing decision was made"
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _read_supervisor_prompt() -> str:
|
|
|
|
"""Read supervisor system prompt from file."""
|
|
|
|
try:
|
|
|
|
prompt_path = (
|
|
|
|
Path(__file__).parent
|
|
|
|
/ "config"
|
|
|
|
/ "prompts"
|
|
|
|
/ "supervisor_multi_agent_prompt.txt"
|
|
|
|
)
|
|
|
|
if prompt_path.exists():
|
|
|
|
return prompt_path.read_text().strip()
|
|
|
|
except Exception as e:
|
|
|
|
logger.warning(f"Could not read supervisor prompt file: {e}")
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Fallback to supervisor fallback prompt file
|
|
|
|
try:
|
|
|
|
fallback_path = (
|
|
|
|
Path(__file__).parent
|
|
|
|
/ "config"
|
|
|
|
/ "prompts"
|
|
|
|
/ "supervisor_fallback_prompt.txt"
|
|
|
|
)
|
|
|
|
if fallback_path.exists():
|
|
|
|
return fallback_path.read_text().strip()
|
|
|
|
except Exception as e:
|
|
|
|
logger.warning(f"Could not read supervisor fallback prompt file: {e}")
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Final hardcoded fallback if files not found
|
|
|
|
return (
|
|
|
|
"You are the Supervisor Agent orchestrating a team of specialized SRE agents."
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
def _read_planning_prompt() -> str:
|
|
|
|
"""Read planning prompt from file."""
|
|
|
|
try:
|
|
|
|
prompt_path = (
|
|
|
|
Path(__file__).parent
|
|
|
|
/ "config"
|
|
|
|
/ "prompts"
|
|
|
|
/ "supervisor_planning_prompt.txt"
|
|
|
|
)
|
|
|
|
if prompt_path.exists():
|
|
|
|
return prompt_path.read_text().strip()
|
|
|
|
except Exception as e:
|
|
|
|
logger.warning(f"Could not read planning prompt file: {e}")
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Fallback planning prompt
|
|
|
|
return """Create a simple, focused investigation plan with 2-3 steps maximum.
|
|
|
|
Create the plan in JSON format with these fields:
|
|
|
|
- steps: List of 3-5 investigation steps
|
|
|
|
- agents_sequence: List of agents to invoke (kubernetes_agent, logs_agent, metrics_agent, runbooks_agent)
|
|
|
|
- complexity: "simple" or "complex"
|
|
|
|
- auto_execute: true or false
|
|
|
|
- reasoning: Brief explanation of the investigation approach"""
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
|
|
|
|
class SupervisorAgent:
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
"""Supervisor agent that orchestrates other agents with memory capabilities."""
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
llm_provider: str = "bedrock",
|
|
|
|
force_delete_memory: bool = False,
|
|
|
|
**llm_kwargs,
|
|
|
|
):
|
2025-07-16 14:07:30 -04:00
|
|
|
self.llm_provider = llm_provider
|
|
|
|
self.llm = self._create_llm(**llm_kwargs)
|
|
|
|
self.system_prompt = _read_supervisor_prompt()
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
self.formatter = create_formatter(llm_provider=llm_provider)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Initialize memory system
|
|
|
|
self.memory_config = _load_memory_config()
|
|
|
|
if self.memory_config.enabled:
|
2025-08-13 08:32:37 -04:00
|
|
|
# Use region from llm_kwargs if provided for bedrock
|
|
|
|
memory_region = llm_kwargs.get("region_name", self.memory_config.region) if llm_provider == "bedrock" else self.memory_config.region
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
self.memory_client = SREMemoryClient(
|
|
|
|
memory_name=self.memory_config.memory_name,
|
2025-08-13 08:32:37 -04:00
|
|
|
region=memory_region,
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
force_delete=force_delete_memory,
|
|
|
|
)
|
|
|
|
self.memory_hooks = MemoryHookProvider(self.memory_client)
|
|
|
|
self.conversation_manager = create_conversation_memory_manager(
|
|
|
|
self.memory_client
|
|
|
|
)
|
|
|
|
|
|
|
|
# Create memory tools for supervisor agent
|
|
|
|
self.memory_tools = create_memory_tools(self.memory_client)
|
|
|
|
|
|
|
|
# Create react agent with memory tools for supervised planning
|
|
|
|
self.planning_agent = create_react_agent(self.llm, self.memory_tools)
|
|
|
|
logger.info(
|
|
|
|
f"Memory system initialized for supervisor agent with {len(self.memory_tools)} memory tools"
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
self.memory_client = None
|
|
|
|
self.memory_hooks = None
|
|
|
|
self.conversation_manager = None
|
|
|
|
self.memory_tools = []
|
|
|
|
self.planning_agent = None
|
|
|
|
logger.info("Memory system disabled")
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
def _create_llm(self, **kwargs):
|
2025-08-01 13:24:58 -04:00
|
|
|
"""Create LLM instance with improved error handling."""
|
|
|
|
return create_llm_with_error_handling(self.llm_provider, **kwargs)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
async def retrieve_memory(
|
|
|
|
self,
|
|
|
|
memory_type: str,
|
|
|
|
query: str,
|
|
|
|
actor_id: str,
|
|
|
|
max_results: int = 5,
|
|
|
|
session_id: Optional[str] = None,
|
|
|
|
) -> str:
|
|
|
|
"""Retrieve information from long-term memory using the retrieve_memory tool."""
|
|
|
|
if not self.memory_tools:
|
|
|
|
return "Memory system not enabled"
|
|
|
|
|
|
|
|
# Find the retrieve_memory tool
|
|
|
|
retrieve_tool = None
|
|
|
|
for tool in self.memory_tools:
|
|
|
|
if tool.name == "retrieve_memory":
|
|
|
|
retrieve_tool = tool
|
|
|
|
break
|
|
|
|
|
|
|
|
if not retrieve_tool:
|
|
|
|
return "Retrieve memory tool not available"
|
|
|
|
|
|
|
|
try:
|
|
|
|
logger.info(
|
|
|
|
f"Supervisor using retrieve_memory tool: type={memory_type}, query='{query}', actor_id={actor_id}"
|
|
|
|
)
|
|
|
|
result = retrieve_tool._run(
|
|
|
|
memory_type=memory_type,
|
|
|
|
query=query,
|
|
|
|
actor_id=actor_id,
|
|
|
|
max_results=max_results,
|
|
|
|
session_id=session_id,
|
|
|
|
)
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(f"Error retrieving memory: {e}", exc_info=True)
|
|
|
|
return f"Error retrieving memory: {str(e)}"
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
async def create_investigation_plan(self, state: AgentState) -> InvestigationPlan:
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
"""Create an investigation plan for the user's query with memory context."""
|
2025-07-16 14:07:30 -04:00
|
|
|
current_query = state.get("current_query", "No query provided")
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
user_id = state.get("user_id", SREConstants.agents.default_user_id)
|
|
|
|
incident_id = state.get("incident_id")
|
2025-08-08 09:22:15 -04:00
|
|
|
|
|
|
|
# Update memory tools with the current user_id
|
|
|
|
if self.memory_tools:
|
|
|
|
from .memory.tools import update_memory_tools_user_id
|
|
|
|
|
|
|
|
update_memory_tools_user_id(self.memory_tools, user_id)
|
|
|
|
logger.info(f"Updated memory tools with user_id: {user_id}")
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Use user_id as actor_id for investigation memory retrieval (consistent with storage)
|
|
|
|
actor_id = state.get(
|
|
|
|
"user_id", state.get("actor_id", SREConstants.agents.default_actor_id)
|
|
|
|
)
|
|
|
|
session_id = state.get("session_id")
|
|
|
|
|
|
|
|
# Retrieve memory context if memory system is enabled
|
|
|
|
memory_context_text = ""
|
|
|
|
if self.memory_client:
|
|
|
|
try:
|
|
|
|
logger.info(
|
|
|
|
f"Retrieving memory context for user_id={user_id}, query='{current_query}'"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Get memory context from hooks
|
|
|
|
if not session_id:
|
|
|
|
raise ValueError(
|
|
|
|
"session_id is required for memory retrieval but not found in state"
|
|
|
|
)
|
|
|
|
|
|
|
|
memory_context = self.memory_hooks.on_investigation_start(
|
|
|
|
query=current_query,
|
|
|
|
user_id=user_id,
|
|
|
|
actor_id=actor_id,
|
|
|
|
session_id=session_id,
|
|
|
|
incident_id=incident_id,
|
|
|
|
)
|
|
|
|
|
|
|
|
# Store memory context in state
|
|
|
|
state["memory_context"] = memory_context
|
|
|
|
|
|
|
|
# Log user preferences for debugging (they're stored in memory_context)
|
|
|
|
user_prefs = memory_context.get("user_preferences", [])
|
|
|
|
logger.debug(
|
|
|
|
f"Stored {len(user_prefs)} user preferences in memory_context during planning"
|
|
|
|
)
|
|
|
|
logger.debug(
|
|
|
|
f"User preferences being stored in memory_context: {user_prefs}"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Format memory context for prompt
|
|
|
|
pref_count = len(memory_context.get("user_preferences", []))
|
|
|
|
infrastructure_by_agent = memory_context.get(
|
|
|
|
"infrastructure_by_agent", {}
|
|
|
|
)
|
|
|
|
total_knowledge = sum(
|
|
|
|
len(memories) for memories in infrastructure_by_agent.values()
|
|
|
|
)
|
|
|
|
investigation_count = len(memory_context.get("past_investigations", []))
|
|
|
|
|
|
|
|
if memory_context.get("user_preferences"):
|
|
|
|
memory_context_text += f"\nRelevant User Preferences:\n{json.dumps(memory_context['user_preferences'], indent=2, default=_json_serializer)}\n"
|
|
|
|
|
|
|
|
if infrastructure_by_agent:
|
|
|
|
memory_context_text += (
|
|
|
|
"\nRelevant Infrastructure Knowledge (organized by agent):\n"
|
|
|
|
)
|
|
|
|
for agent_id, agent_memories in infrastructure_by_agent.items():
|
|
|
|
memory_context_text += (
|
|
|
|
f"\n From {agent_id} ({len(agent_memories)} items):\n"
|
|
|
|
)
|
|
|
|
memory_context_text += f"{json.dumps(agent_memories, indent=4, default=_json_serializer)}\n"
|
|
|
|
|
|
|
|
if memory_context.get("past_investigations"):
|
|
|
|
memory_context_text += f"\nSimilar Past Investigations:\n{json.dumps(memory_context['past_investigations'], indent=2, default=_json_serializer)}\n"
|
|
|
|
|
|
|
|
logger.info(
|
|
|
|
f"Retrieved memory context for planning: {pref_count} preferences, {total_knowledge} knowledge items from {len(infrastructure_by_agent)} agents, {investigation_count} past investigations"
|
|
|
|
)
|
|
|
|
|
|
|
|
if pref_count + total_knowledge + investigation_count == 0:
|
|
|
|
logger.info(
|
|
|
|
"No relevant memories found - this may be the first interaction or a new topic"
|
|
|
|
)
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(f"Failed to retrieve memory context: {e}", exc_info=True)
|
|
|
|
memory_context_text = ""
|
|
|
|
|
|
|
|
# Enhanced planning prompt that instructs the agent to use memory tools
|
|
|
|
planning_instructions = _read_planning_prompt()
|
|
|
|
# Replace placeholders manually to avoid issues with JSON braces in the prompt
|
|
|
|
formatted_planning_instructions = planning_instructions.replace(
|
|
|
|
"{user_id}", user_id
|
|
|
|
)
|
|
|
|
if session_id:
|
|
|
|
formatted_planning_instructions = formatted_planning_instructions.replace(
|
|
|
|
"{session_id}", session_id
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
planning_prompt = f"""{self.system_prompt}
|
|
|
|
|
|
|
|
User's query: {current_query}
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
{memory_context_text}
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
{formatted_planning_instructions}"""
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
if self.planning_agent and self.memory_tools:
|
|
|
|
# Use planning agent with memory tools
|
|
|
|
try:
|
|
|
|
# Create messages for the planning agent
|
|
|
|
messages = [
|
|
|
|
SystemMessage(content=planning_prompt),
|
|
|
|
HumanMessage(
|
|
|
|
content=f"Create an investigation plan for: {current_query}"
|
|
|
|
),
|
|
|
|
]
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Use the planning agent with memory tools
|
|
|
|
plan_response = await self.planning_agent.ainvoke(
|
|
|
|
{"messages": messages}
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Extract the final message content
|
|
|
|
if plan_response and "messages" in plan_response:
|
|
|
|
final_message = plan_response["messages"][-1]
|
|
|
|
plan_text = final_message.content
|
|
|
|
|
|
|
|
# Always log the complete planning agent response
|
|
|
|
logger.info(f"Planning agent original response: {plan_text}")
|
|
|
|
|
|
|
|
# Try to extract JSON from the response
|
|
|
|
import re
|
|
|
|
|
|
|
|
# Look for JSON in the response - try multiple patterns
|
|
|
|
json_patterns = [
|
|
|
|
r'\{[^{}]*"steps"[^{}]*"agents_sequence"[^{}]*"complexity"[^{}]*"auto_execute"[^{}]*"reasoning"[^{}]*\}', # Specific pattern for our structure
|
|
|
|
r'\{.*?"steps".*?\}', # Broader pattern
|
|
|
|
r"\{.*\}", # Most general pattern
|
|
|
|
]
|
|
|
|
|
|
|
|
json_content = None
|
|
|
|
for pattern in json_patterns:
|
|
|
|
json_match = re.search(pattern, plan_text, re.DOTALL)
|
|
|
|
if json_match:
|
|
|
|
json_content = json_match.group()
|
|
|
|
logger.info(
|
|
|
|
f"Extracted JSON content using pattern: {json_content}"
|
|
|
|
)
|
|
|
|
break
|
|
|
|
|
|
|
|
if json_content:
|
|
|
|
try:
|
|
|
|
# Clean up the JSON content
|
|
|
|
json_content = json_content.strip()
|
|
|
|
plan_json = json.loads(json_content)
|
|
|
|
logger.info(f"Successfully parsed JSON: {plan_json}")
|
|
|
|
plan = InvestigationPlan(**plan_json)
|
|
|
|
logger.info(
|
|
|
|
"Successfully created InvestigationPlan from JSON"
|
|
|
|
)
|
|
|
|
except json.JSONDecodeError as e:
|
|
|
|
logger.error(f"JSON decode error: {e}")
|
|
|
|
logger.error(f"Failed JSON content: {json_content}")
|
|
|
|
logger.warning(
|
|
|
|
"Could not parse JSON from planning agent response, using fallback"
|
|
|
|
)
|
|
|
|
plan = InvestigationPlan(
|
|
|
|
steps=[
|
|
|
|
"Investigate the reported issue",
|
|
|
|
"Analyze findings and provide recommendations",
|
|
|
|
],
|
|
|
|
agents_sequence=["metrics_agent", "logs_agent"],
|
|
|
|
complexity="simple",
|
|
|
|
auto_execute=True,
|
|
|
|
reasoning="Default investigation plan due to JSON parsing error",
|
|
|
|
)
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(f"Error creating InvestigationPlan: {e}")
|
|
|
|
logger.error(f"Plan JSON was: {plan_json}")
|
|
|
|
logger.warning(
|
|
|
|
"Could not create InvestigationPlan from parsed JSON, using fallback"
|
|
|
|
)
|
|
|
|
plan = InvestigationPlan(
|
|
|
|
steps=[
|
|
|
|
"Investigate the reported issue",
|
|
|
|
"Analyze findings and provide recommendations",
|
|
|
|
],
|
|
|
|
agents_sequence=["metrics_agent", "logs_agent"],
|
|
|
|
complexity="simple",
|
|
|
|
auto_execute=True,
|
|
|
|
reasoning="Default investigation plan due to validation error",
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
# Fallback to basic plan if JSON parsing fails
|
|
|
|
logger.warning(
|
|
|
|
"Could not find JSON pattern in planning agent response, using fallback"
|
|
|
|
)
|
|
|
|
logger.warning(f"Response content was: {plan_text}")
|
|
|
|
plan = InvestigationPlan(
|
|
|
|
steps=[
|
|
|
|
"Investigate the reported issue",
|
|
|
|
"Analyze findings and provide recommendations",
|
|
|
|
],
|
|
|
|
agents_sequence=["metrics_agent", "logs_agent"],
|
|
|
|
complexity="simple",
|
|
|
|
auto_execute=True,
|
|
|
|
reasoning="Default investigation plan due to no JSON found",
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
raise ValueError("No response from planning agent")
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(
|
|
|
|
f"Error using planning agent with memory tools: {e}", exc_info=True
|
|
|
|
)
|
|
|
|
# Fallback to structured output without tools
|
|
|
|
structured_llm = self.llm.with_structured_output(InvestigationPlan)
|
|
|
|
plan = await structured_llm.ainvoke(
|
|
|
|
[
|
|
|
|
SystemMessage(content=planning_prompt),
|
|
|
|
HumanMessage(content=current_query),
|
|
|
|
]
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
# Fallback to structured output without memory tools
|
|
|
|
structured_llm = self.llm.with_structured_output(InvestigationPlan)
|
|
|
|
plan = await structured_llm.ainvoke(
|
|
|
|
[
|
|
|
|
SystemMessage(content=planning_prompt),
|
|
|
|
HumanMessage(content=current_query),
|
|
|
|
]
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
logger.info(
|
|
|
|
f"Created investigation plan: {len(plan.steps)} steps, complexity: {plan.complexity}"
|
|
|
|
)
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
|
|
|
|
# Store conversation in memory
|
|
|
|
if self.conversation_manager and user_id and session_id:
|
|
|
|
try:
|
|
|
|
# Get supervisor display name with fallback
|
|
|
|
supervisor_name = getattr(SREConstants.agents, "supervisor", None)
|
|
|
|
if supervisor_name:
|
|
|
|
supervisor_display_name = supervisor_name.display_name
|
|
|
|
else:
|
|
|
|
supervisor_display_name = "Supervisor Agent"
|
|
|
|
|
|
|
|
messages_to_store = [
|
|
|
|
(current_query, "USER"),
|
|
|
|
(
|
|
|
|
f"[Agent: {supervisor_display_name}]\nInvestigation Plan:\n{self._format_plan_markdown(plan)}",
|
|
|
|
"ASSISTANT",
|
|
|
|
),
|
|
|
|
]
|
|
|
|
|
|
|
|
success = self.conversation_manager.store_conversation_batch(
|
|
|
|
messages=messages_to_store,
|
|
|
|
user_id=user_id,
|
|
|
|
session_id=session_id,
|
|
|
|
agent_name=supervisor_display_name,
|
|
|
|
)
|
|
|
|
|
|
|
|
if success:
|
|
|
|
logger.info("Supervisor: Successfully stored planning conversation")
|
|
|
|
else:
|
|
|
|
logger.warning("Supervisor: Failed to store planning conversation")
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(
|
|
|
|
f"Supervisor: Error storing planning conversation: {e}",
|
|
|
|
exc_info=True,
|
|
|
|
)
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
return plan
|
|
|
|
|
|
|
|
def _format_plan_markdown(self, plan: InvestigationPlan) -> str:
|
|
|
|
"""Format investigation plan as properly formatted markdown."""
|
|
|
|
plan_text = "## 🔍 Investigation Plan\n\n"
|
|
|
|
|
|
|
|
# Add steps with proper numbering and formatting
|
|
|
|
for i, step in enumerate(plan.steps, 1):
|
|
|
|
plan_text += f"**{i}.** {step}\n\n"
|
|
|
|
|
|
|
|
# Add metadata
|
|
|
|
plan_text += f"**📊 Complexity:** {plan.complexity.title()}\n"
|
|
|
|
plan_text += f"**🤖 Auto-execute:** {'Yes' if plan.auto_execute else 'No'}\n"
|
|
|
|
if plan.reasoning:
|
|
|
|
plan_text += f"**💭 Reasoning:** {plan.reasoning}\n"
|
|
|
|
|
|
|
|
# Add agents involved
|
|
|
|
if plan.agents_sequence:
|
|
|
|
agents_list = ", ".join(
|
|
|
|
[agent.replace("_", " ").title() for agent in plan.agents_sequence]
|
|
|
|
)
|
|
|
|
plan_text += f"**👥 Agents involved:** {agents_list}\n"
|
|
|
|
|
|
|
|
return plan_text
|
|
|
|
|
|
|
|
async def route(self, state: AgentState) -> Dict[str, Any]:
|
|
|
|
"""Determine which agent should handle the query next."""
|
|
|
|
agents_invoked = state.get("agents_invoked", [])
|
|
|
|
|
|
|
|
# Check if we have an existing plan
|
|
|
|
existing_plan = state.get("metadata", {}).get("investigation_plan")
|
|
|
|
|
|
|
|
if not existing_plan:
|
|
|
|
# First time - create investigation plan
|
|
|
|
plan = await self.create_investigation_plan(state)
|
|
|
|
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
# Check if we should auto-approve the plan (defaults to False if not set)
|
|
|
|
auto_approve = state.get("auto_approve_plan", False)
|
|
|
|
|
|
|
|
if not plan.auto_execute and not auto_approve:
|
2025-07-16 14:07:30 -04:00
|
|
|
# Complex plan - present to user for approval
|
|
|
|
plan_text = self._format_plan_markdown(plan)
|
|
|
|
return {
|
|
|
|
"next": "FINISH",
|
|
|
|
"metadata": {
|
|
|
|
**state.get("metadata", {}),
|
|
|
|
"investigation_plan": plan.model_dump(),
|
|
|
|
"routing_reasoning": f"Created investigation plan. Complexity: {plan.complexity}",
|
|
|
|
"plan_pending_approval": True,
|
|
|
|
"plan_text": plan_text,
|
|
|
|
},
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Preserve memory context in state
|
|
|
|
"memory_context": state.get("memory_context", {}),
|
2025-07-16 14:07:30 -04:00
|
|
|
}
|
|
|
|
else:
|
|
|
|
# Simple plan - start execution
|
|
|
|
next_agent = (
|
|
|
|
plan.agents_sequence[0] if plan.agents_sequence else "FINISH"
|
|
|
|
)
|
|
|
|
plan_text = self._format_plan_markdown(plan)
|
|
|
|
return {
|
|
|
|
"next": next_agent,
|
|
|
|
"metadata": {
|
|
|
|
**state.get("metadata", {}),
|
|
|
|
"investigation_plan": plan.model_dump(),
|
|
|
|
"routing_reasoning": f"Executing plan step 1: {plan.steps[0] if plan.steps else 'Start'}",
|
|
|
|
"plan_step": 0,
|
|
|
|
"plan_text": plan_text,
|
|
|
|
"show_plan": True,
|
|
|
|
},
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Preserve memory context in state
|
|
|
|
"memory_context": state.get("memory_context", {}),
|
2025-07-16 14:07:30 -04:00
|
|
|
}
|
|
|
|
else:
|
|
|
|
# Continue executing existing plan
|
|
|
|
plan = InvestigationPlan(**existing_plan)
|
|
|
|
current_step = state.get("metadata", {}).get("plan_step", 0)
|
|
|
|
|
|
|
|
# Check if plan is complete
|
|
|
|
if current_step >= len(plan.agents_sequence) or not agents_invoked:
|
|
|
|
next_step = current_step
|
|
|
|
else:
|
|
|
|
next_step = current_step + 1
|
|
|
|
|
|
|
|
if next_step >= len(plan.agents_sequence):
|
|
|
|
# Plan complete
|
|
|
|
return {
|
|
|
|
"next": "FINISH",
|
|
|
|
"metadata": {
|
|
|
|
**state.get("metadata", {}),
|
|
|
|
"routing_reasoning": "Investigation plan completed. Presenting results.",
|
|
|
|
"plan_step": next_step,
|
|
|
|
},
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Preserve memory context in state
|
|
|
|
"memory_context": state.get("memory_context", {}),
|
2025-07-16 14:07:30 -04:00
|
|
|
}
|
|
|
|
else:
|
|
|
|
# Continue with next agent in plan
|
|
|
|
next_agent = plan.agents_sequence[next_step]
|
|
|
|
step_description = (
|
|
|
|
plan.steps[next_step]
|
|
|
|
if next_step < len(plan.steps)
|
|
|
|
else f"Execute {next_agent}"
|
|
|
|
)
|
|
|
|
|
|
|
|
return {
|
|
|
|
"next": next_agent,
|
|
|
|
"metadata": {
|
|
|
|
**state.get("metadata", {}),
|
|
|
|
"routing_reasoning": f"Executing plan step {next_step + 1}: {step_description}",
|
|
|
|
"plan_step": next_step,
|
|
|
|
},
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Preserve memory context in state
|
|
|
|
"memory_context": state.get("memory_context", {}),
|
2025-07-16 14:07:30 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
async def aggregate_responses(self, state: AgentState) -> Dict[str, Any]:
|
|
|
|
"""Aggregate responses from multiple agents into a final response."""
|
|
|
|
agent_results = state.get("agent_results", {})
|
|
|
|
metadata = state.get("metadata", {})
|
|
|
|
|
|
|
|
# Check if this is a plan approval request
|
|
|
|
if metadata.get("plan_pending_approval"):
|
|
|
|
plan = metadata.get("investigation_plan", {})
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
query = state.get("current_query", "Investigation") or "Investigation"
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
# Use enhanced formatting for plan approval
|
|
|
|
try:
|
|
|
|
approval_response = self.formatter.format_plan_approval(plan, query)
|
|
|
|
except Exception as e:
|
|
|
|
logger.warning(
|
|
|
|
f"Failed to use enhanced formatting: {e}, falling back to plain text"
|
|
|
|
)
|
|
|
|
plan_text = metadata.get("plan_text", "")
|
|
|
|
approval_response = f"""## Investigation Plan
|
|
|
|
|
|
|
|
I've analyzed your query and created the following investigation plan:
|
|
|
|
|
|
|
|
{plan_text}
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
**Complexity:** {plan.get("complexity", "unknown").title()}
|
|
|
|
**Reasoning:** {plan.get("reasoning", "Standard investigation approach")}
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
This plan will help systematically investigate your issue. Would you like me to proceed with this plan, or would you prefer to modify it?
|
|
|
|
|
|
|
|
You can:
|
|
|
|
- Type "proceed" or "yes" to execute the plan
|
|
|
|
- Type "modify" to suggest changes
|
|
|
|
- Ask specific questions about any step"""
|
|
|
|
|
|
|
|
return {"final_response": approval_response, "next": "FINISH"}
|
|
|
|
|
|
|
|
if not agent_results:
|
|
|
|
return {"final_response": "No agent responses to aggregate."}
|
|
|
|
|
|
|
|
# Use enhanced formatting for investigation results
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
query = state.get("current_query", "Investigation") or "Investigation"
|
2025-07-16 14:07:30 -04:00
|
|
|
plan = metadata.get("investigation_plan")
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Get user preferences from memory_context (not directly from state)
|
|
|
|
user_preferences = []
|
|
|
|
if "memory_context" in state:
|
|
|
|
memory_ctx = state["memory_context"]
|
|
|
|
user_preferences = memory_ctx.get("user_preferences", [])
|
|
|
|
logger.debug(
|
|
|
|
f"Memory context found with {len(user_preferences)} user preferences"
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
logger.debug("No memory_context found in state")
|
|
|
|
|
|
|
|
logger.info(
|
|
|
|
f"Retrieved user preferences from memory_context for aggregation: {len(user_preferences)} items"
|
|
|
|
)
|
|
|
|
logger.debug(f"Full state keys available: {list(state.keys())}")
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
try:
|
|
|
|
# Try enhanced formatting first
|
|
|
|
final_response = self.formatter.format_investigation_response(
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
query=query,
|
|
|
|
agent_results=agent_results,
|
|
|
|
metadata=metadata,
|
|
|
|
plan=plan,
|
|
|
|
user_preferences=user_preferences,
|
2025-07-16 14:07:30 -04:00
|
|
|
)
|
|
|
|
except Exception as e:
|
|
|
|
logger.warning(
|
|
|
|
f"Failed to use enhanced formatting: {e}, falling back to LLM aggregation"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Fallback to LLM-based aggregation
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
try:
|
|
|
|
# Get system message from prompt loader
|
|
|
|
system_prompt = prompt_loader.load_prompt(
|
|
|
|
"supervisor_aggregation_system"
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
# Determine if this is plan-based or standard aggregation
|
|
|
|
is_plan_based = plan is not None
|
2025-07-16 14:07:30 -04:00
|
|
|
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
# Prepare template variables
|
|
|
|
query = (
|
|
|
|
state.get("current_query", "No query provided")
|
|
|
|
or "No query provided"
|
|
|
|
)
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
agent_results_json = json.dumps(
|
|
|
|
agent_results, indent=2, default=_json_serializer
|
|
|
|
)
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
auto_approve_plan = state.get("auto_approve_plan", False) or False
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Use the user_preferences we already retrieved
|
|
|
|
user_preferences_json = (
|
|
|
|
json.dumps(user_preferences, indent=2, default=_json_serializer)
|
|
|
|
if user_preferences
|
|
|
|
else ""
|
|
|
|
)
|
|
|
|
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
if is_plan_based:
|
|
|
|
current_step = metadata.get("plan_step", 0)
|
|
|
|
total_steps = len(plan.get("steps", []))
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
plan_json = json.dumps(
|
|
|
|
plan.get("steps", []), indent=2, default=_json_serializer
|
|
|
|
)
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
|
|
|
|
aggregation_prompt = (
|
|
|
|
prompt_loader.get_supervisor_aggregation_prompt(
|
|
|
|
is_plan_based=True,
|
|
|
|
query=query,
|
|
|
|
agent_results=agent_results_json,
|
|
|
|
auto_approve_plan=auto_approve_plan,
|
|
|
|
current_step=current_step + 1,
|
|
|
|
total_steps=total_steps,
|
|
|
|
plan=plan_json,
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
user_preferences=user_preferences_json,
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
)
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
aggregation_prompt = (
|
|
|
|
prompt_loader.get_supervisor_aggregation_prompt(
|
|
|
|
is_plan_based=False,
|
|
|
|
query=query,
|
|
|
|
agent_results=agent_results_json,
|
|
|
|
auto_approve_plan=auto_approve_plan,
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
user_preferences=user_preferences_json,
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
)
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
except Exception as e:
|
|
|
|
logger.error(f"Error loading aggregation prompts: {e}")
|
|
|
|
# Fallback to simple prompt
|
|
|
|
system_prompt = "You are an expert at presenting technical investigation results clearly and professionally."
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
aggregation_prompt = f"Summarize these findings: {json.dumps(agent_results, indent=2, default=_json_serializer)}"
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
response = await self.llm.ainvoke(
|
|
|
|
[
|
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158)
* feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime
- Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility
- Create Dockerfile for ARM64-based containerization
- Add deployment scripts for automated ECR push and AgentCore deployment
- Update backend API URLs from placeholders to actual endpoints
- Update gateway configuration for production use
- Add dependencies for AgentCore runtime support
Implements #143
* chore: Add deployment artifacts to .gitignore
- Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore
- Remove already tracked deployment artifacts from git
* feat: Make ANTHROPIC_API_KEY optional in deployment
- Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY
- Show info message when using Amazon Bedrock as provider
- Update .env.example to clarify ANTHROPIC_API_KEY is optional
- Only include ANTHROPIC_API_KEY in environment variables if it exists
* fix: Use uv run python instead of python in build script
- Update build_and_deploy.sh to use 'uv run python' for deployment
- Change to parent directory to ensure uv environment is available
- Fixes 'python: command not found' error during deployment
* refactor: Improve deployment script structure and create .env symlink
- Flatten nested if-else blocks in deploy_agent_runtime.py for better readability
- Add 10-second sleep after deletion to ensure cleanup completes
- Create symlink from deployment/.env to sre_agent/.env to avoid duplication
- Move time import to top of file with other imports
* feat: Add debug mode support and comprehensive deployment guide
Add --debug command line flag and DEBUG environment variable support:
- Created shared logging configuration module
- Updated CLI and runtime to support --debug flag
- Made debug traces conditional on DEBUG environment variable
- Added debug mode for container and AgentCore deployments
Enhanced build and deployment script:
- Added command line argument for ECR repository name
- Added help documentation and usage examples
- Added support for local builds (x86_64) vs AgentCore builds (arm64)
- Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY
Created comprehensive deployment guide:
- Step-by-step instructions from local testing to production
- Docker platform documentation (x86_64 vs arm64)
- Environment variable configuration with .env file usage
- Debug mode examples and troubleshooting guide
- Provider configuration for Bedrock and Anthropic
Updated README with AgentCore Runtime deployment section and documentation links.
* docs: Update SRE Agent README with deployment flow diagram and fix directory reference
- Fix reference from 04-SRE-agent to SRE-agent in README
- Add comprehensive flowchart showing development to production deployment flow
- Update overview to mention Amazon Bedrock AgentCore Runtime deployment
- Remove emojis from documentation for professional appearance
* docs: Replace mermaid diagram with ASCII step-by-step flow diagram
- Change from block-style mermaid diagram to ASCII flow diagram
- Show clear step-by-step progression from development to production
- Improve readability with structured boxes and arrows
- Minor text improvements for clarity
* feat: Implement comprehensive prompt management system and enhance deployment guide
- Create centralized prompt template system with external files in config/prompts/
- Add PromptLoader utility class with LRU caching and template variable substitution
- Integrate PromptConfig into SREConstants for centralized configuration management
- Update all agents (nodes, supervisor, output_formatter) to use prompt loader
- Replace 150+ lines of hardcoded prompts with modular, maintainable template system
- Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout
- Add quick-start copy-paste command sequence for streamlined deployment
- Improve constants system with comprehensive model, AWS, timeout, and prompt configs
- Add architectural assessment document to .gitignore for local analysis
- Run black formatting across all updated Python files
* docs: Consolidate deployment and security documentation
- Rename deployment-and-security.md to security.md and remove redundant deployment content
- Enhance security.md with comprehensive production security guidelines including:
- Authentication and authorization best practices
- Encryption and data protection requirements
- Operational security monitoring and logging
- Input validation and prompt security measures
- Infrastructure security recommendations
- Compliance and governance frameworks
- Update README.md to reference new security.md file
- Eliminate redundancy between deployment-guide.md and deployment-and-security.md
- Improve documentation organization with clear separation of concerns
* config: Replace hardcoded endpoints with placeholder domains
- Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com'
- k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011
- logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012
- metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013
- runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014
- Update agent configuration to use placeholder AgentCore gateway endpoint
- agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint'
- Improve security by removing hardcoded production endpoints from repository
- Enable template-based configuration that users can customize during setup
- Align with existing documentation patterns for placeholder domain replacement
2025-07-27 15:05:03 -04:00
|
|
|
SystemMessage(content=system_prompt),
|
2025-07-16 14:07:30 -04:00
|
|
|
HumanMessage(content=aggregation_prompt),
|
|
|
|
]
|
|
|
|
)
|
|
|
|
|
|
|
|
final_response = response.content
|
|
|
|
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
# Store final response conversation in memory
|
|
|
|
user_id = state.get("user_id")
|
|
|
|
session_id = state.get("session_id")
|
|
|
|
if (
|
|
|
|
self.conversation_manager
|
|
|
|
and user_id
|
|
|
|
and session_id
|
|
|
|
and not metadata.get("plan_pending_approval")
|
|
|
|
):
|
|
|
|
try:
|
|
|
|
# Store the final aggregated response
|
|
|
|
# Get supervisor display name with fallback
|
|
|
|
supervisor_name = getattr(SREConstants.agents, "supervisor", None)
|
|
|
|
if supervisor_name:
|
|
|
|
supervisor_display_name = supervisor_name.display_name
|
|
|
|
else:
|
|
|
|
supervisor_display_name = "Supervisor Agent"
|
|
|
|
|
|
|
|
messages_to_store = [
|
|
|
|
(
|
|
|
|
f"[Agent: {supervisor_display_name}]\n{final_response}",
|
|
|
|
"ASSISTANT",
|
|
|
|
)
|
|
|
|
]
|
|
|
|
|
|
|
|
success = self.conversation_manager.store_conversation_batch(
|
|
|
|
messages=messages_to_store,
|
|
|
|
user_id=user_id,
|
|
|
|
session_id=session_id,
|
|
|
|
agent_name=supervisor_display_name,
|
|
|
|
)
|
|
|
|
|
|
|
|
if success:
|
|
|
|
logger.info(
|
|
|
|
"Supervisor: Successfully stored final response conversation"
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
logger.warning(
|
|
|
|
"Supervisor: Failed to store final response conversation"
|
|
|
|
)
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(
|
|
|
|
f"Supervisor: Error storing final response conversation: {e}",
|
|
|
|
exc_info=True,
|
|
|
|
)
|
|
|
|
|
|
|
|
# Save investigation summary to memory if enabled
|
|
|
|
if self.memory_client and not metadata.get("plan_pending_approval"):
|
|
|
|
try:
|
|
|
|
incident_id = state.get("incident_id", "auto-generated")
|
|
|
|
agents_used = state.get("agents_invoked", [])
|
|
|
|
logger.debug(
|
|
|
|
f"Saving investigation summary for incident_id={incident_id}, agents_used={agents_used}"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Use user_id as actor_id for investigation summaries (consistent with conversation memory)
|
|
|
|
actor_id = state.get(
|
|
|
|
"user_id",
|
|
|
|
state.get("actor_id", SREConstants.agents.default_actor_id),
|
|
|
|
)
|
|
|
|
self.memory_hooks.on_investigation_complete(
|
|
|
|
state=state, final_response=final_response, actor_id=actor_id
|
|
|
|
)
|
|
|
|
logger.info(
|
|
|
|
f"Saved investigation summary to memory for incident {incident_id}"
|
|
|
|
)
|
|
|
|
except Exception as e:
|
|
|
|
logger.error(
|
|
|
|
f"Failed to save investigation summary: {e}", exc_info=True
|
|
|
|
)
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
return {"final_response": final_response, "next": "FINISH"}
|