Complete implementation of Agentic Context Engineering (ACE) framework: Core modules (optimization_engine/context/): - playbook.py: AtomizerPlaybook with helpful/harmful scoring - reflector.py: AtomizerReflector for insight extraction - session_state.py: Context isolation (exposed/isolated state) - feedback_loop.py: Automated learning from trial results - compaction.py: Long-session context management - cache_monitor.py: KV-cache optimization tracking - runner_integration.py: OptimizationRunner integration Dashboard integration: - context.py: 12 REST API endpoints for playbook management Tests: - test_context_engineering.py: 44 unit tests - test_context_integration.py: 16 integration tests Documentation: - CONTEXT_ENGINEERING_REPORT.md: Comprehensive implementation report - CONTEXT_ENGINEERING_API.md: Complete API reference - SYS_17_CONTEXT_ENGINEERING.md: System protocol - Updated cheatsheet with SYS_17 quick reference - Enhanced bootstrap (00_BOOTSTRAP_V2.md) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
464 lines
16 KiB
Python
464 lines
16 KiB
Python
"""
|
|
Integration test for full context engineering pipeline.
|
|
|
|
Tests the complete ACE (Agentic Context Engineering) workflow:
|
|
1. Starting fresh session
|
|
2. Running optimization with successes and failures
|
|
3. Verifying playbook learns from outcomes
|
|
4. Validating persistence across sessions
|
|
5. Testing context compaction under load
|
|
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
import tempfile
|
|
import json
|
|
from datetime import datetime, timedelta
|
|
import random
|
|
|
|
from optimization_engine.context.playbook import AtomizerPlaybook, InsightCategory
|
|
from optimization_engine.context.reflector import AtomizerReflector, OptimizationOutcome
|
|
from optimization_engine.context.session_state import AtomizerSessionState, TaskType
|
|
from optimization_engine.context.feedback_loop import FeedbackLoop
|
|
from optimization_engine.context.compaction import CompactionManager, EventType
|
|
from optimization_engine.context.cache_monitor import ContextCacheOptimizer, StablePrefixBuilder
|
|
|
|
|
|
class TestFullOptimizationPipeline:
    """End-to-end test of optimization with context engineering."""

    def test_complete_optimization_cycle(self, tmp_path):
        """
        Simulates a complete optimization run:

        1. Initialize context engineering
        2. Process multiple trials (mix of success/failure)
        3. Finalize and commit learning
        4. Verify playbook has learned
        """
        playbook_path = tmp_path / "playbook.json"

        # Seeded RNG so the success/failure mix is reproducible.  The
        # original unseeded global `random` made this test flaky: a run
        # with zero failures broke the "mistakes learned" check below.
        rng = random.Random(42)

        # Initialize feedback loop
        feedback = FeedbackLoop(playbook_path)

        # Simulate study with mixed results (~70% success rate)
        trial_results = []
        for i in range(20):
            success = rng.random() > 0.3  # 70% success rate
            obj_value = 100 - i * 2 + rng.uniform(-5, 5) if success else None

            result = feedback.process_trial_result(
                trial_number=i,
                success=success,
                objective_value=obj_value if success else 0.0,
                design_variables={
                    "thickness": 0.5 + i * 0.1,
                    "width": 10 + i * 0.5
                },
                context_items_used=[],
                errors=["convergence failure"] if not success else None
            )

            trial_results.append({
                "trial": i,
                "success": success,
                # BUG FIX: objective_value was never stored here before, so
                # the best_value computation below always fell back to
                # float('inf') via dict.get().
                "objective_value": obj_value,
                "insights": result.get("insights_extracted", 0)
            })

        # Finalize study
        successful = sum(1 for r in trial_results if r["success"])
        final_result = feedback.finalize_study({
            "name": "integration_test_study",
            "total_trials": 20,
            "best_value": min(
                r["objective_value"]
                for r in trial_results if r["success"]
            ) if successful > 0 else 0,
            "convergence_rate": successful / 20
        })

        # Verify learning occurred
        assert final_result["insights_added"] > 0
        assert final_result["playbook_size"] > 0
        assert playbook_path.exists()

        # Load and verify playbook content
        playbook = AtomizerPlaybook.load(playbook_path)

        # Should have some mistake insights from failures (the seeded RNG
        # guarantees at least one failed trial in the 20 above)
        mistakes = [
            item for item in playbook.items.values()
            if item.category == InsightCategory.MISTAKE
        ]
        assert len(mistakes) > 0

    def test_learning_persistence_across_sessions(self, tmp_path):
        """
        Test that learning persists across multiple "sessions".
        """
        playbook_path = tmp_path / "playbook.json"

        # Session 1: Generate initial learning (all trials succeed)
        feedback1 = FeedbackLoop(playbook_path)
        for i in range(10):
            feedback1.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100 - i,
                design_variables={"x": i}
            )
        feedback1.finalize_study({
            "name": "session1",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 1.0
        })

        # Verify session 1 created insights
        pb1 = AtomizerPlaybook.load(playbook_path)
        session1_items = len(pb1.items)
        assert session1_items > 0

        # Session 2: Continue learning with a fresh FeedbackLoop instance
        feedback2 = FeedbackLoop(playbook_path)

        # Should have loaded the existing playbook from disk
        assert len(feedback2.playbook.items) == session1_items

        # Add more trials (alternating success/failure)
        for i in range(10, 20):
            feedback2.process_trial_result(
                trial_number=i,
                success=i % 2 == 0,
                objective_value=100 - i if i % 2 == 0 else 0.0,
                design_variables={"x": i},
                errors=["test error"] if i % 2 != 0 else None
            )
        feedback2.finalize_study({
            "name": "session2",
            "total_trials": 10,
            "best_value": 80,
            "convergence_rate": 0.5
        })

        # Verify combined learning
        pb2 = AtomizerPlaybook.load(playbook_path)
        assert len(pb2.items) >= session1_items  # At least as many items

    def test_playbook_pruning_over_time(self, tmp_path):
        """
        Test that harmful insights get pruned.
        """
        playbook_path = tmp_path / "playbook.json"

        # Create playbook with a "bad" insight
        playbook = AtomizerPlaybook()
        bad_item = playbook.add_insight(
            InsightCategory.STRATEGY,
            "Use extremely coarse mesh"  # Bad advice
        )

        # Give it many harmful outcomes
        for _ in range(10):
            playbook.record_outcome(bad_item.id, helpful=False)

        playbook.save(playbook_path)

        # Create feedback loop and finalize
        feedback = FeedbackLoop(playbook_path)

        # Process a few trials
        for i in range(5):
            feedback.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100,
                design_variables={}
            )

        feedback.finalize_study({
            "name": "prune_test",
            "total_trials": 5,
            "best_value": 100,
            "convergence_rate": 1.0
        })

        # Bad insight should be pruned (net_score -10 < threshold -3)
        final_playbook = AtomizerPlaybook.load(playbook_path)
        assert bad_item.id not in final_playbook.items

    def test_context_compaction_under_load(self, tmp_path):
        """
        Test that compaction works correctly under high trial volume.
        """
        # Seeded RNG so trial durations are reproducible run-to-run.
        rng = random.Random(7)

        manager = CompactionManager(
            compaction_threshold=20,
            keep_recent=10,
            keep_errors=True
        )

        # Simulate 100 trials; every 5th trial fails and logs an error
        errors_added = 0
        for i in range(100):
            success = i % 5 != 0

            if success:
                manager.add_trial_event(
                    trial_number=i,
                    success=True,
                    objective=100 - i * 0.5,
                    duration=rng.uniform(30, 120)
                )
            else:
                manager.add_trial_event(
                    trial_number=i,
                    success=False,
                    duration=rng.uniform(30, 120)
                )
                manager.add_error_event(
                    f"Error in trial {i}",
                    error_type="test_error"
                )
                errors_added += 1

        # Should have compacted (100 events >> threshold of 20)
        stats = manager.get_stats()
        assert stats["compaction_count"] > 0

        # All errors should be preserved (keep_errors=True)
        assert stats["error_events"] == errors_added

        # Total events should be bounded
        assert stats["total_events"] < 100  # Compaction reduced count

        # Context string should be reasonable length
        context = manager.get_context_string()
        assert len(context) < 50000  # Not too long

    def test_session_state_throughout_optimization(self, tmp_path):
        """
        Test session state tracking throughout an optimization.
        """
        session = AtomizerSessionState(session_id="integration_test")
        session.exposed.task_type = TaskType.RUN_OPTIMIZATION
        session.exposed.study_name = "state_test"

        # Simulate optimization progress
        for i in range(20):
            session.add_action(f"Processing trial {i}")

            # Periodic status updates (every 5th trial, skipping i=0)
            if i % 5 == 0 and i > 0:
                session.update_study_status(
                    name="state_test",
                    status="running",
                    trials_completed=i,
                    trials_total=20,
                    best_value=100 - i,
                    best_trial=i
                )

            # Sporadic non-fatal errors
            if i % 7 == 0:
                session.add_error(f"Minor issue at trial {i}")

        # Verify state
        assert session.exposed.trials_completed == 15  # Last update at i=15
        assert len(session.exposed.recent_errors) <= 5  # Bounded

        # Context should include key information
        context = session.get_llm_context()
        assert "state_test" in context
        assert "running" in context

    def test_cache_optimization_effectiveness(self):
        """
        Test that cache optimization actually works.
        """
        optimizer = ContextCacheOptimizer()

        # Build stable prefix (should be cached across requests)
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer, an optimization assistant")
        builder.add_capabilities("I can run FEA optimizations")
        builder.add_tools("Available tools: NX, Nastran, Optuna")
        stable_prefix = builder.build()

        # Simulate 10 requests with the same stable prefix but varying
        # semi-stable and dynamic portions
        for i in range(10):
            optimizer.prepare_context(
                stable_prefix=stable_prefix,
                semi_stable=f"Session info for request {i}",
                dynamic=f"User message {i}"
            )

        # Should have high cache hit rate (first request is the only miss)
        assert optimizer.stats.hit_rate >= 0.9  # 9/10 hits
        assert optimizer.stats.estimated_savings_percent >= 80  # Good savings
|
|
class TestReflectorLearningPatterns:
    """Test that the reflector extracts useful patterns."""

    def test_convergence_pattern_learning(self, tmp_path):
        """Test learning from convergence failures."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)

        # Feed the reflector a run of identical convergence failures.
        failures = [
            OptimizationOutcome(
                trial_number=n,
                success=False,
                objective_value=None,
                solver_errors=["Convergence failure at iteration 100"],
                design_variables={"x": n * 0.1},
                duration_seconds=300
            )
            for n in range(5)
        ]
        for outcome in failures:
            reflector.analyze_trial(outcome)

        reflector.commit_insights()

        # At least one committed insight should mention convergence.
        assert any(
            "convergence" in entry.content.lower()
            for entry in book.items.values()
        )

    def test_success_pattern_learning(self, tmp_path):
        """Test learning from successful designs."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)

        # Five successful trials sharing similar design characteristics.
        for n in range(5):
            reflector.analyze_trial(OptimizationOutcome(
                trial_number=n,
                success=True,
                objective_value=50 + n,
                design_variables={
                    "thickness": 1.0 + n * 0.1,  # All around 1.0-1.5
                    "width": 10.0  # Consistent
                },
                duration_seconds=60
            ))

        reflector.commit_insights()

        # Successful runs should yield at least one STRATEGY insight.
        assert any(
            entry.category == InsightCategory.STRATEGY
            for entry in book.items.values()
        )
|
|
|
|
|
|
class TestErrorTrackerIntegration:
    """Test error tracker plugin integration."""

    def test_error_classification(self):
        """Test error classification function."""
        from optimization_engine.plugins.post_solve.error_tracker import classify_error

        # Table of message -> expected classification.
        expected = {
            "Convergence failure at iteration 50": "convergence_failure",
            "Element distortion detected": "mesh_error",
            "Matrix singularity": "singularity",
            "Out of memory": "memory_error",
            "License checkout failed": "license_error",
            "Random unknown error": "unknown_error",
        }
        for message, category in expected.items():
            assert classify_error(message) == category

    def test_error_tracking_hook(self, tmp_path):
        """Test the error tracking hook function."""
        from optimization_engine.plugins.post_solve.error_tracker import track_error

        hook_context = {
            "trial_number": 5,
            "working_dir": str(tmp_path),
            "output_dir": str(tmp_path),
            "solver_returncode": 1,
            "error_message": "Convergence failure at iteration 100",
            "design_variables": {"x": 1.0, "y": 2.0},
        }

        outcome = track_error(hook_context)

        assert outcome["error_tracked"] is True
        assert outcome["error_type"] == "convergence_failure"

        # The hook should have created a JSONL error log in the output dir.
        log_path = tmp_path / "error_history.jsonl"
        assert log_path.exists()

        # Verify the first entry written to the log.
        with open(log_path) as fh:
            first_entry = json.loads(fh.readline())

        assert first_entry["trial"] == 5
        assert first_entry["error_type"] == "convergence_failure"
|
|
|
|
|
class TestPlaybookContextGeneration:
    """Test context generation for different scenarios."""

    def test_context_for_optimization_task(self):
        """Test context generation for optimization."""
        playbook = AtomizerPlaybook()

        # Add various insights, keeping the returned item so outcomes can
        # be recorded against its real ID.
        strategy = playbook.add_insight(InsightCategory.STRATEGY, "Start with coarse mesh")
        playbook.add_insight(InsightCategory.MISTAKE, "Avoid tiny elements")
        playbook.add_insight(InsightCategory.TOOL, "Use TPE for exploration")

        # Give the strategy insight a positive score.
        # BUG FIX: previously recorded against the hard-coded ID
        # "str-00001", which silently breaks if the playbook's ID scheme
        # changes; use the item actually returned by add_insight(), as
        # every other test in this module does.
        playbook.record_outcome(strategy.id, helpful=True)
        playbook.record_outcome(strategy.id, helpful=True)

        context = playbook.get_context_for_task("optimization", max_items=10)

        assert "Playbook" in context
        assert "STRATEGY" in context
        assert "coarse mesh" in context

    def test_context_filtering_by_confidence(self):
        """Test that low-confidence items are filtered."""
        playbook = AtomizerPlaybook()

        # Add item with low confidence (1 helpful, 3 harmful)
        item = playbook.add_insight(InsightCategory.STRATEGY, "Questionable advice")
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=False)
        playbook.record_outcome(item.id, helpful=False)
        playbook.record_outcome(item.id, helpful=False)
        # confidence = 1/4 = 0.25

        # High min_confidence should exclude it
        context = playbook.get_context_for_task(
            "optimization",
            min_confidence=0.5
        )

        assert "Questionable advice" not in context

    def test_context_ordering_by_score(self):
        """Test that items are ordered by net score."""
        playbook = AtomizerPlaybook()

        # Add items with different scores
        low = playbook.add_insight(InsightCategory.STRATEGY, "Low score advice")
        high = playbook.add_insight(InsightCategory.STRATEGY, "High score advice")

        # Give the "high" item a better net score (5 helpful vs 1)
        for _ in range(5):
            playbook.record_outcome(high.id, helpful=True)
        playbook.record_outcome(low.id, helpful=True)

        context = playbook.get_context_for_task("optimization")

        # Higher-scoring item should appear first in the rendered context
        high_pos = context.find("High score")
        low_pos = context.find("Low score")
        assert high_pos < low_pos
|
|
|
|
|
|
if __name__ == "__main__":
|
|
pytest.main([__file__, "-v"])
|