feat: Implement ACE Context Engineering framework (SYS_17)

Complete implementation of Agentic Context Engineering (ACE) framework:

Core modules (optimization_engine/context/):
- playbook.py: AtomizerPlaybook with helpful/harmful scoring
- reflector.py: AtomizerReflector for insight extraction
- session_state.py: Context isolation (exposed/isolated state)
- feedback_loop.py: Automated learning from trial results
- compaction.py: Long-session context management
- cache_monitor.py: KV-cache optimization tracking
- runner_integration.py: OptimizationRunner integration

Dashboard integration:
- context.py: 12 REST API endpoints for playbook management

Tests:
- test_context_engineering.py: 44 unit tests
- test_context_integration.py: 16 integration tests

Documentation:
- CONTEXT_ENGINEERING_REPORT.md: Comprehensive implementation report
- CONTEXT_ENGINEERING_API.md: Complete API reference
- SYS_17_CONTEXT_ENGINEERING.md: System protocol
- Updated cheatsheet with SYS_17 quick reference
- Enhanced bootstrap (00_BOOTSTRAP_V2.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-29 20:21:20 -05:00
parent 0110d80401
commit 773f8ff8af
19 changed files with 8184 additions and 2 deletions

View File

@@ -0,0 +1,739 @@
"""
Test suite for context engineering components.
Tests the ACE (Agentic Context Engineering) implementation:
- Playbook: Knowledge store with helpful/harmful tracking
- Reflector: Outcome analysis and insight extraction
- SessionState: Context isolation
- Compaction: Long-running session management
- FeedbackLoop: Automated learning
"""
import pytest
from pathlib import Path
import tempfile
import json
from datetime import datetime
from optimization_engine.context.playbook import (
AtomizerPlaybook,
PlaybookItem,
InsightCategory
)
from optimization_engine.context.reflector import (
AtomizerReflector,
OptimizationOutcome
)
from optimization_engine.context.session_state import (
AtomizerSessionState,
TaskType,
ExposedState,
IsolatedState
)
from optimization_engine.context.compaction import (
CompactionManager,
ContextEvent,
EventType,
ContextBudgetManager
)
from optimization_engine.context.cache_monitor import (
ContextCacheOptimizer,
CacheStats,
StablePrefixBuilder
)
from optimization_engine.context.feedback_loop import (
FeedbackLoop
)
class TestAtomizerPlaybook:
    """Unit tests for the playbook knowledge store."""

    def test_create_empty_playbook(self):
        """A freshly constructed playbook has no items and starts at version 1."""
        pb = AtomizerPlaybook()
        assert pb.version == 1
        assert len(pb.items) == 0

    def test_add_insight(self):
        """Adding an insight assigns an id, zeroed counters, and the source trial."""
        pb = AtomizerPlaybook()
        entry = pb.add_insight(
            content="Use shell elements for thin walls",
            category=InsightCategory.STRATEGY,
            source_trial=1,
        )
        assert entry.id == "str-00001"
        assert entry.category == InsightCategory.STRATEGY
        assert entry.helpful_count == 0
        assert entry.harmful_count == 0
        assert 1 in entry.source_trials
        assert len(pb.items) == 1

    def test_add_multiple_categories(self):
        """Ids are numbered per category, not globally."""
        pb = AtomizerPlaybook()
        for cat, text in [
            (InsightCategory.STRATEGY, "Strategy 1"),
            (InsightCategory.MISTAKE, "Mistake 1"),
            (InsightCategory.TOOL, "Tool tip 1"),
            (InsightCategory.STRATEGY, "Strategy 2"),
        ]:
            pb.add_insight(cat, text)
        assert len(pb.items) == 4
        for expected_id in ("str-00001", "str-00002", "mis-00001", "tool-00001"):
            assert expected_id in pb.items

    def test_deduplication(self):
        """Adding identical content twice merges into a single item."""
        pb = AtomizerPlaybook()
        first = pb.add_insight(InsightCategory.STRATEGY, "Use shell elements")
        second = pb.add_insight(InsightCategory.STRATEGY, "Use shell elements")
        assert len(pb.items) == 1
        assert first is second  # merged back into the same object
        assert second.helpful_count == 1  # duplicate counts as a helpful signal

    def test_outcome_tracking(self):
        """record_outcome updates the counters and the derived scores."""
        pb = AtomizerPlaybook()
        entry = pb.add_insight(InsightCategory.STRATEGY, "Test insight")
        for verdict in (True, True, False):
            pb.record_outcome(entry.id, helpful=verdict)
        assert entry.helpful_count == 2
        assert entry.harmful_count == 1
        assert entry.net_score == 1
        assert entry.confidence == 2/3

    def test_confidence_calculation(self):
        """Confidence starts neutral and tracks the helpful/total ratio."""
        pb = AtomizerPlaybook()
        entry = pb.add_insight(InsightCategory.STRATEGY, "Test")
        assert entry.confidence == 0.5  # no feedback yet -> neutral
        pb.record_outcome(entry.id, helpful=True)
        assert entry.confidence == 1.0  # 1 of 1 helpful
        pb.record_outcome(entry.id, helpful=False)
        assert entry.confidence == 0.5  # 1 of 2 helpful

    def test_persistence(self, tmp_path):
        """A save/load round-trip preserves items, counters, and tags."""
        pb = AtomizerPlaybook()
        pb.add_insight(InsightCategory.MISTAKE, "Don't do this", tags=["test"])
        pb.add_insight(InsightCategory.STRATEGY, "Do this instead")
        pb.record_outcome("mis-00001", helpful=False)
        pb.record_outcome("str-00001", helpful=True)
        target = tmp_path / "playbook.json"
        pb.save(target)

        restored = AtomizerPlaybook.load(target)
        assert len(restored.items) == 2
        assert "mis-00001" in restored.items
        assert restored.items["mis-00001"].harmful_count == 1
        assert "test" in restored.items["mis-00001"].tags
        assert restored.items["str-00001"].helpful_count == 1

    def test_pruning(self):
        """Items whose net score falls below the threshold are removed."""
        pb = AtomizerPlaybook()
        entry = pb.add_insight(InsightCategory.STRATEGY, "Bad advice")
        for _ in range(5):
            pb.record_outcome(entry.id, helpful=False)
        assert entry.net_score == -5
        assert pb.prune_harmful(threshold=-3) == 1
        assert len(pb.items) == 0

    def test_search_by_content(self):
        """Search finds items by substring of their content."""
        pb = AtomizerPlaybook()
        pb.add_insight(InsightCategory.STRATEGY, "Use shell elements for thin walls")
        pb.add_insight(InsightCategory.STRATEGY, "Solid elements for thick parts")
        pb.add_insight(InsightCategory.MISTAKE, "Don't use coarse mesh")
        hits = pb.search_by_content("shell elements")
        assert len(hits) >= 1
        assert "shell" in hits[0].content.lower()

    def test_get_context_for_task(self):
        """The generated context string mentions the playbook, ids, and scores."""
        pb = AtomizerPlaybook()
        pb.add_insight(InsightCategory.STRATEGY, "Strategy 1")
        pb.add_insight(InsightCategory.MISTAKE, "Mistake 1")
        # Boost the strategy so it carries a visible score.
        for _ in range(2):
            pb.record_outcome("str-00001", helpful=True)
        rendered = pb.get_context_for_task("optimization")
        for fragment in ("Playbook", "str-00001", "helpful=2"):
            assert fragment in rendered
class TestAtomizerReflector:
    """Unit tests for outcome analysis and insight extraction."""

    def test_create_reflector(self):
        """A new reflector wraps the playbook and has nothing pending."""
        pb = AtomizerPlaybook()
        refl = AtomizerReflector(pb)
        assert refl.playbook is pb
        assert len(refl.pending_insights) == 0

    def test_analyze_successful_trial(self):
        """A successful trial yields at least one helpful insight."""
        pb = AtomizerPlaybook()
        refl = AtomizerReflector(pb)
        good = OptimizationOutcome(
            trial_number=1,
            success=True,
            objective_value=100.0,
            constraint_violations=[],
            solver_errors=[],
            design_variables={"thickness": 1.0, "width": 5.0},
            extractor_used="mass_extractor",
            duration_seconds=60,
        )
        found = refl.analyze_trial(good)
        assert len(found) >= 1
        assert any(insight.helpful for insight in found)
        assert 1 in refl.analyzed_trials

    def test_analyze_failed_trial(self):
        """A failed trial produces mistake insights marked as not helpful."""
        pb = AtomizerPlaybook()
        refl = AtomizerReflector(pb)
        bad = OptimizationOutcome(
            trial_number=1,
            success=False,
            objective_value=None,
            constraint_violations=["stress > 250 MPa"],
            solver_errors=["convergence failure at iteration 50"],
            design_variables={"thickness": 0.5},
            extractor_used="stress_extractor",
            duration_seconds=120,
        )
        found = refl.analyze_trial(bad)
        # Expect one insight for the solver error and one for the violation.
        assert len(found) >= 2
        mistakes = [i for i in found if i.category == InsightCategory.MISTAKE]
        assert mistakes
        assert not any(i.helpful for i in mistakes)

    def test_analyze_mesh_error(self):
        """A mesh-related solver error is tagged accordingly."""
        pb = AtomizerPlaybook()
        refl = AtomizerReflector(pb)
        mesh_failure = OptimizationOutcome(
            trial_number=5,
            success=False,
            objective_value=None,
            constraint_violations=[],
            solver_errors=["Element distortion: negative jacobian detected"],
            design_variables={},
            extractor_used="",
            duration_seconds=30,
        )
        found = refl.analyze_trial(mesh_failure)
        assert any("mesh" in str(insight.tags).lower() for insight in found)

    def test_commit_insights(self):
        """Committing moves pending insights into the playbook and clears them."""
        pb = AtomizerPlaybook()
        refl = AtomizerReflector(pb)
        outcome = OptimizationOutcome(
            trial_number=1,
            success=True,
            objective_value=100.0,
            constraint_violations=[],
            solver_errors=[],
            design_variables={"thickness": 1.0},
            extractor_used="mass_extractor",
            duration_seconds=60,
        )
        refl.analyze_trial(outcome)
        committed = refl.commit_insights()
        assert committed > 0
        assert len(pb.items) > 0
        assert len(refl.pending_insights) == 0  # drained after commit

    def test_analyze_study_completion(self):
        """A high-convergence study is summarized as a robust method."""
        pb = AtomizerPlaybook()
        refl = AtomizerReflector(pb)
        found = refl.analyze_study_completion(
            study_name="test_study",
            total_trials=100,
            best_value=50.0,
            convergence_rate=0.95,
            method="TPE",
        )
        assert len(found) >= 1
        assert any("robust" in insight.content.lower() for insight in found)
class TestSessionState:
    """Unit tests for exposed/isolated session state management."""

    def test_create_session(self):
        """A new session carries its id, no task type, and no actions."""
        state = AtomizerSessionState(session_id="test_session")
        assert state.session_id == "test_session"
        assert state.exposed.task_type is None
        assert len(state.exposed.recent_actions) == 0

    def test_set_task_type(self):
        """The exposed task type is a plain settable attribute."""
        state = AtomizerSessionState(session_id="test")
        state.exposed.task_type = TaskType.CREATE_STUDY
        assert state.exposed.task_type == TaskType.CREATE_STUDY

    def test_add_action(self):
        """Actions accumulate in order on the exposed state."""
        state = AtomizerSessionState(session_id="test")
        state.add_action("Created study directory")
        state.add_action("Configured optimization")
        assert len(state.exposed.recent_actions) == 2
        assert "Created study" in state.exposed.recent_actions[0]

    def test_action_compression(self):
        """Once past the limit, old actions are folded into a summary entry."""
        state = AtomizerSessionState(session_id="test")
        for i in range(15):  # deliberately exceed the retention limit
            state.add_action(f"Action {i}")
        assert len(state.exposed.recent_actions) <= 12
        assert any("earlier actions" in entry.lower()
                   for entry in state.exposed.recent_actions)

    def test_add_error(self):
        """Errors are recorded and prefixed with their type when given."""
        state = AtomizerSessionState(session_id="test")
        state.add_error("Solver failed", error_type="convergence")
        state.add_error("Mesh error")
        assert len(state.exposed.recent_errors) == 2
        assert "[convergence]" in state.exposed.recent_errors[0]

    def test_update_study_status(self):
        """Study progress fields are copied onto the exposed state."""
        state = AtomizerSessionState(session_id="test")
        state.update_study_status(
            name="bracket_opt",
            status="running",
            trials_completed=25,
            trials_total=100,
            best_value=0.5,
            best_trial=20,
        )
        assert state.exposed.study_name == "bracket_opt"
        assert state.exposed.trials_completed == 25
        assert state.exposed.best_value == 0.5

    def test_llm_context_generation(self):
        """The LLM context string surfaces the key progress fields."""
        state = AtomizerSessionState(session_id="test")
        state.exposed.task_type = TaskType.RUN_OPTIMIZATION
        state.exposed.study_name = "test_study"
        state.exposed.trials_completed = 50
        state.exposed.trials_total = 100
        state.exposed.best_value = 0.5
        rendered = state.get_llm_context()
        for fragment in ("test_study", "50", "0.5", "run_optimization"):
            assert fragment in rendered

    def test_isolated_state_access(self):
        """Isolated data stays out of the LLM context but is loadable."""
        state = AtomizerSessionState(session_id="test")
        state.isolated.nx_model_path = "/path/to/model.prt"
        # Hidden from the model-facing view...
        assert "/path/to/model.prt" not in state.get_llm_context()
        # ...but retrievable through the explicit accessor.
        assert state.load_isolated_data("nx_model_path") == "/path/to/model.prt"

    def test_persistence(self, tmp_path):
        """A save/load round-trip preserves id, task type, and study name."""
        state = AtomizerSessionState(session_id="test_persist")
        state.exposed.task_type = TaskType.ANALYZE_RESULTS
        state.exposed.study_name = "persist_study"
        state.add_action("Test action")
        target = tmp_path / "session.json"
        state.save(target)

        restored = AtomizerSessionState.load(target)
        assert restored.session_id == "test_persist"
        assert restored.exposed.task_type == TaskType.ANALYZE_RESULTS
        assert restored.exposed.study_name == "persist_study"
class TestCompactionManager:
    """Unit tests for context compaction."""

    def test_create_manager(self):
        """Construction stores the thresholds and starts with no events."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        assert mgr.compaction_threshold == 10
        assert mgr.keep_recent == 5
        assert len(mgr.events) == 0

    def test_add_events(self):
        """Trial events are appended below the compaction threshold."""
        mgr = CompactionManager(compaction_threshold=50)
        mgr.add_trial_event(trial_number=1, success=True, objective=100.0)
        mgr.add_trial_event(trial_number=2, success=False)
        assert len(mgr.events) == 2

    def test_compaction_trigger(self):
        """Crossing the threshold triggers compaction and bounds the log."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        for i in range(15):
            event = ContextEvent(
                timestamp=datetime.now(),
                event_type=EventType.TRIAL_COMPLETE,
                summary=f"Trial {i} complete",
                details={"trial_number": i, "objective": i * 0.1},
            )
            mgr.add_event(event)
        assert mgr.compaction_count > 0
        assert len(mgr.events) <= 10

    def test_error_preservation(self):
        """An early error survives compaction of later trial events."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=3)
        mgr.add_error_event("Critical solver failure", "solver_error")
        # Flood with regular events to force several compactions.
        for i in range(20):
            mgr.add_trial_event(trial_number=i, success=True, objective=i)
        surviving = [e for e in mgr.events if e.event_type == EventType.ERROR]
        assert len(surviving) == 1
        assert "Critical solver failure" in surviving[0].summary

    def test_milestone_preservation(self):
        """Milestones are exempt from compaction, like errors."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=3)
        mgr.add_milestone("Optimization started", {"method": "TPE"})
        for i in range(20):
            mgr.add_trial_event(trial_number=i, success=True)
        kept = [e for e in mgr.events if e.event_type == EventType.MILESTONE]
        assert len(kept) == 1

    def test_context_string_generation(self):
        """The rendered history mentions the header, trials, and errors."""
        mgr = CompactionManager()
        mgr.add_trial_event(trial_number=1, success=True, objective=100.0)
        mgr.add_error_event("Test error")
        rendered = mgr.get_context_string()
        for fragment in ("Optimization History", "Trial 1", "Test error"):
            assert fragment in rendered

    def test_get_stats(self):
        """Stats report a bounded event count and the compaction count."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        for i in range(15):
            mgr.add_trial_event(trial_number=i, success=i % 2 == 0)
        stats = mgr.get_stats()
        assert stats["total_events"] <= 15
        assert stats["compaction_count"] > 0
class TestCacheMonitor:
    """Unit tests for KV-cache monitoring."""

    def test_create_optimizer(self):
        """A new optimizer starts with zeroed request/hit counters."""
        opt = ContextCacheOptimizer()
        assert opt.stats.total_requests == 0
        assert opt.stats.cache_hits == 0

    def test_prepare_context(self):
        """prepare_context concatenates all three tiers and counts the request."""
        opt = ContextCacheOptimizer()
        assembled = opt.prepare_context(
            stable_prefix="Stable content",
            semi_stable="Session content",
            dynamic="User message",
        )
        for fragment in ("Stable content", "Session content", "User message"):
            assert fragment in assembled
        assert opt.stats.total_requests == 1

    def test_cache_hit_detection(self):
        """Repeating the stable prefix registers as a cache hit."""
        opt = ContextCacheOptimizer()
        opt.prepare_context("Stable", "Semi", "Dynamic 1")
        opt.prepare_context("Stable", "Semi", "Dynamic 2")  # same prefix -> hit
        assert opt.stats.total_requests == 2
        assert opt.stats.cache_hits == 1

    def test_cache_miss_detection(self):
        """Changing the stable prefix registers as a miss each time."""
        opt = ContextCacheOptimizer()
        opt.prepare_context("Stable 1", "Semi", "Dynamic")
        opt.prepare_context("Stable 2", "Semi", "Dynamic")  # Different prefix
        assert opt.stats.cache_hits == 0
        assert opt.stats.cache_misses == 2

    def test_stable_prefix_builder(self):
        """The builder assembles sections in their defined priority order."""
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer")
        builder.add_capabilities("I can optimize")
        builder.add_tools("Tool definitions here")
        prefix = builder.build()
        assert "I am Atomizer" in prefix
        assert "I can optimize" in prefix
        # Identity outranks capabilities, so it must appear earlier.
        assert prefix.index("Atomizer") < prefix.index("optimize")
class TestFeedbackLoop:
    """Unit tests for the automated feedback loop."""

    def test_create_feedback_loop(self, tmp_path):
        """A new loop owns a playbook and has processed nothing yet."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        assert loop.playbook is not None
        assert loop._total_trials_processed == 0

    def test_process_successful_trial(self, tmp_path):
        """A successful trial is echoed back and counted as a success."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        outcome = loop.process_trial_result(
            trial_number=1,
            success=True,
            objective_value=100.0,
            design_variables={"thickness": 1.0},
        )
        assert outcome["trial_number"] == 1
        assert outcome["success"] is True
        assert loop._total_trials_processed == 1
        assert loop._successful_trials == 1

    def test_process_failed_trial(self, tmp_path):
        """A failed trial with errors is counted as a failure."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        outcome = loop.process_trial_result(
            trial_number=1,
            success=False,
            objective_value=0.0,
            design_variables={"thickness": 0.5},
            errors=["Convergence failure"],
        )
        assert outcome["success"] is False
        assert loop._failed_trials == 1

    def test_finalize_study(self, tmp_path):
        """Finalizing commits insights and persists the playbook to disk."""
        target = tmp_path / "playbook.json"
        loop = FeedbackLoop(target)
        # Every third trial fails; the rest improve on the objective.
        for i in range(10):
            ok = i % 3 != 0
            loop.process_trial_result(
                trial_number=i,
                success=ok,
                objective_value=100 - i if ok else 0,
                design_variables={"x": i * 0.1},
            )
        summary = loop.finalize_study({
            "name": "test_study",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 0.7
        })
        assert summary["insights_added"] > 0
        assert summary["playbook_size"] > 0
        assert target.exists()  # playbook written as part of finalization

    def test_playbook_item_attribution(self, tmp_path):
        """Items listed as used in successful trials gain helpful credit."""
        target = tmp_path / "playbook.json"
        # Seed the playbook with a single strategy before the loop starts.
        seeded = AtomizerPlaybook()
        entry = seeded.add_insight(InsightCategory.STRATEGY, "Test strategy")
        seeded.save(target)

        loop = FeedbackLoop(target)
        for trial, value in ((1, 100.0), (2, 95.0)):
            loop.process_trial_result(
                trial_number=trial,
                success=True,
                objective_value=value,
                design_variables={},
                context_items_used=[entry.id],
            )
        assert loop.playbook.items[entry.id].helpful_count == 2
class TestContextBudgetManager:
    """Unit tests for context budget management."""

    def test_create_manager(self):
        """The default budget totals 100k tokens and includes the prefix section."""
        budget_mgr = ContextBudgetManager()
        assert budget_mgr.budget["total"] == 100000
        assert "stable_prefix" in budget_mgr.budget

    def test_estimate_tokens(self):
        """Token estimation is len(text) // 4."""
        budget_mgr = ContextBudgetManager()
        # "Hello world" has 11 characters -> 11 // 4 == 2 tokens.
        assert budget_mgr.estimate_tokens("Hello world") == 2

    def test_update_usage(self):
        """Usage within budget reports section, tokens, and no overrun."""
        budget_mgr = ContextBudgetManager()
        report = budget_mgr.update_usage("stable_prefix", "x" * 20000)  # 5000 tokens
        assert report["section"] == "stable_prefix"
        assert report["tokens"] == 5000
        assert report["over_budget"] is False

    def test_over_budget_warning(self):
        """Exceeding a section budget flags the overrun with a warning."""
        budget_mgr = ContextBudgetManager()
        # 40000 chars ~= 10000 tokens, double the 5000-token section budget.
        report = budget_mgr.update_usage("stable_prefix", "x" * 40000)
        assert report["over_budget"] is True
        assert "warning" in report

    def test_get_status(self):
        """The status summary exposes totals, utilization, and advice."""
        budget_mgr = ContextBudgetManager()
        budget_mgr.update_usage("stable_prefix", "x" * 10000)
        budget_mgr.update_usage("protocols", "x" * 20000)
        status = budget_mgr.get_status()
        for key in ("total_used", "utilization", "recommendations"):
            assert key in status
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

View File

@@ -0,0 +1,463 @@
"""
Integration test for full context engineering pipeline.
Tests the complete ACE (Agentic Context Engineering) workflow:
1. Starting fresh session
2. Running optimization with successes and failures
3. Verifying playbook learns from outcomes
4. Validating persistence across sessions
5. Testing context compaction under load
"""
import pytest
from pathlib import Path
import tempfile
import json
from datetime import datetime, timedelta
import random
from optimization_engine.context.playbook import AtomizerPlaybook, InsightCategory
from optimization_engine.context.reflector import AtomizerReflector, OptimizationOutcome
from optimization_engine.context.session_state import AtomizerSessionState, TaskType
from optimization_engine.context.feedback_loop import FeedbackLoop
from optimization_engine.context.compaction import CompactionManager, EventType
from optimization_engine.context.cache_monitor import ContextCacheOptimizer, StablePrefixBuilder
class TestFullOptimizationPipeline:
    """End-to-end test of optimization with context engineering."""
    def test_complete_optimization_cycle(self, tmp_path):
        """
        Simulates a complete optimization run:
        1. Initialize context engineering
        2. Process multiple trials (mix of success/failure)
        3. Finalize and commit learning
        4. Verify playbook has learned

        The RNG is seeded so the success/failure mix -- and therefore the
        assertions below -- is deterministic across runs.
        """
        random.seed(42)  # unseeded randomness previously made this test flaky
        playbook_path = tmp_path / "playbook.json"
        # Initialize feedback loop
        feedback = FeedbackLoop(playbook_path)
        # Simulate study with mixed results
        trial_results = []
        for i in range(20):
            success = random.random() > 0.3  # ~70% success rate
            obj_value = 100 - i * 2 + random.uniform(-5, 5) if success else None
            result = feedback.process_trial_result(
                trial_number=i,
                success=success,
                objective_value=obj_value if success else 0.0,
                design_variables={
                    "thickness": 0.5 + i * 0.1,
                    "width": 10 + i * 0.5
                },
                context_items_used=[],
                errors=["convergence failure"] if not success else None
            )
            trial_results.append({
                "trial": i,
                "success": success,
                # Record the objective so best_value below is computed from
                # real data. Previously this key was never stored, so the
                # min() fell back to float('inf') for every trial.
                "objective_value": obj_value,
                "insights": result.get("insights_extracted", 0)
            })
        # Finalize study
        successful = sum(1 for r in trial_results if r["success"])
        final_result = feedback.finalize_study({
            "name": "integration_test_study",
            "total_trials": 20,
            "best_value": min(
                r["objective_value"]
                for r in trial_results if r["success"]
            ) if successful > 0 else 0,
            "convergence_rate": successful / 20
        })
        # Verify learning occurred
        assert final_result["insights_added"] > 0
        assert final_result["playbook_size"] > 0
        assert playbook_path.exists()
        # Load and verify playbook content
        playbook = AtomizerPlaybook.load(playbook_path)
        # Should have some mistake insights from failures
        mistakes = [
            item for item in playbook.items.values()
            if item.category == InsightCategory.MISTAKE
        ]
        assert len(mistakes) > 0
    def test_learning_persistence_across_sessions(self, tmp_path):
        """
        Test that learning persists across multiple "sessions".
        """
        playbook_path = tmp_path / "playbook.json"
        # Session 1: Generate initial learning with all-successful trials.
        feedback1 = FeedbackLoop(playbook_path)
        for i in range(10):
            feedback1.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100 - i,
                design_variables={"x": i}
            )
        feedback1.finalize_study({
            "name": "session1",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 1.0
        })
        # Verify session 1 created insights
        pb1 = AtomizerPlaybook.load(playbook_path)
        session1_items = len(pb1.items)
        assert session1_items > 0
        # Session 2: Continue learning
        feedback2 = FeedbackLoop(playbook_path)
        # Should have loaded existing playbook
        assert len(feedback2.playbook.items) == session1_items
        # Add more trials (alternating success/failure this time).
        for i in range(10, 20):
            feedback2.process_trial_result(
                trial_number=i,
                success=i % 2 == 0,
                objective_value=100 - i if i % 2 == 0 else 0.0,
                design_variables={"x": i},
                errors=["test error"] if i % 2 != 0 else None
            )
        feedback2.finalize_study({
            "name": "session2",
            "total_trials": 10,
            "best_value": 80,
            "convergence_rate": 0.5
        })
        # Verify combined learning
        pb2 = AtomizerPlaybook.load(playbook_path)
        assert len(pb2.items) >= session1_items  # At least as many items
    def test_playbook_pruning_over_time(self, tmp_path):
        """
        Test that harmful insights get pruned.
        """
        playbook_path = tmp_path / "playbook.json"
        # Create playbook with a "bad" insight
        playbook = AtomizerPlaybook()
        bad_item = playbook.add_insight(
            InsightCategory.STRATEGY,
            "Use extremely coarse mesh"  # Bad advice
        )
        # Give it many harmful outcomes
        for _ in range(10):
            playbook.record_outcome(bad_item.id, helpful=False)
        playbook.save(playbook_path)
        # Create feedback loop and finalize
        feedback = FeedbackLoop(playbook_path)
        # Process a few trials
        for i in range(5):
            feedback.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100,
                design_variables={}
            )
        feedback.finalize_study({
            "name": "prune_test",
            "total_trials": 5,
            "best_value": 100,
            "convergence_rate": 1.0
        })
        # Bad insight should be pruned (net_score -10 < threshold -3)
        final_playbook = AtomizerPlaybook.load(playbook_path)
        assert bad_item.id not in final_playbook.items
    def test_context_compaction_under_load(self, tmp_path):
        """
        Test that compaction works correctly under high trial volume.

        Durations are random but seeded; the assertions depend only on
        counts, so seeding simply keeps the run reproducible.
        """
        random.seed(7)  # deterministic durations/log sizes
        manager = CompactionManager(
            compaction_threshold=20,
            keep_recent=10,
            keep_errors=True
        )
        # Simulate 100 trials; every fifth one fails and logs an error.
        errors_added = 0
        for i in range(100):
            success = i % 5 != 0
            if success:
                manager.add_trial_event(
                    trial_number=i,
                    success=True,
                    objective=100 - i * 0.5,
                    duration=random.uniform(30, 120)
                )
            else:
                manager.add_trial_event(
                    trial_number=i,
                    success=False,
                    duration=random.uniform(30, 120)
                )
                manager.add_error_event(
                    f"Error in trial {i}",
                    error_type="test_error"
                )
                errors_added += 1
        # Should have compacted
        stats = manager.get_stats()
        assert stats["compaction_count"] > 0
        # All errors should be preserved
        assert stats["error_events"] == errors_added
        # Total events should be bounded
        assert stats["total_events"] < 100  # Compaction reduced count
        # Context string should be reasonable length
        context = manager.get_context_string()
        assert len(context) < 50000  # Not too long
    def test_session_state_throughout_optimization(self, tmp_path):
        """
        Test session state tracking throughout an optimization.
        """
        session = AtomizerSessionState(session_id="integration_test")
        session.exposed.task_type = TaskType.RUN_OPTIMIZATION
        session.exposed.study_name = "state_test"
        # Simulate optimization progress: status every 5 trials, an error every 7.
        for i in range(20):
            session.add_action(f"Processing trial {i}")
            if i % 5 == 0 and i > 0:
                session.update_study_status(
                    name="state_test",
                    status="running",
                    trials_completed=i,
                    trials_total=20,
                    best_value=100 - i,
                    best_trial=i
                )
            if i % 7 == 0:
                session.add_error(f"Minor issue at trial {i}")
        # Verify state
        assert session.exposed.trials_completed == 15  # Last update at i=15
        assert len(session.exposed.recent_errors) <= 5  # Bounded
        # Context should include key information
        context = session.get_llm_context()
        assert "state_test" in context
        assert "running" in context
    def test_cache_optimization_effectiveness(self):
        """
        Test that cache optimization actually works.
        """
        optimizer = ContextCacheOptimizer()
        # Build stable prefix (should be cached)
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer, an optimization assistant")
        builder.add_capabilities("I can run FEA optimizations")
        builder.add_tools("Available tools: NX, Nastran, Optuna")
        stable_prefix = builder.build()
        # Simulate 10 requests with same stable prefix
        for i in range(10):
            optimizer.prepare_context(
                stable_prefix=stable_prefix,
                semi_stable=f"Session info for request {i}",
                dynamic=f"User message {i}"
            )
        # Should have high cache hit rate
        assert optimizer.stats.hit_rate >= 0.9  # 9/10 hits
        assert optimizer.stats.estimated_savings_percent >= 80  # Good savings
class TestReflectorLearningPatterns:
    """Test that the reflector extracts useful patterns."""
    def test_convergence_pattern_learning(self):
        """Repeated convergence failures should yield convergence insights.

        (The unused `tmp_path` fixture was dropped: nothing here touches
        the filesystem, so requesting a temp directory was pure overhead.)
        """
        playbook = AtomizerPlaybook()
        reflector = AtomizerReflector(playbook)
        # Simulate convergence failures
        for i in range(5):
            outcome = OptimizationOutcome(
                trial_number=i,
                success=False,
                objective_value=None,
                solver_errors=["Convergence failure at iteration 100"],
                design_variables={"x": i * 0.1},
                duration_seconds=300
            )
            reflector.analyze_trial(outcome)
        reflector.commit_insights()
        # Should have learned about convergence issues
        convergence_insights = [
            item for item in playbook.items.values()
            if "convergence" in item.content.lower()
        ]
        assert len(convergence_insights) > 0
    def test_success_pattern_learning(self):
        """Consistent successful designs should yield strategy insights."""
        playbook = AtomizerPlaybook()
        reflector = AtomizerReflector(playbook)
        # Simulate successful designs with similar characteristics
        for i in range(5):
            outcome = OptimizationOutcome(
                trial_number=i,
                success=True,
                objective_value=50 + i,
                design_variables={
                    "thickness": 1.0 + i * 0.1,  # All around 1.0-1.5
                    "width": 10.0  # Consistent
                },
                duration_seconds=60
            )
            reflector.analyze_trial(outcome)
        reflector.commit_insights()
        # Should have learned success patterns
        success_insights = [
            item for item in playbook.items.values()
            if item.category == InsightCategory.STRATEGY
        ]
        assert len(success_insights) > 0
class TestErrorTrackerIntegration:
    """Integration tests for the error tracker plugin."""

    def test_error_classification(self):
        """classify_error maps known solver messages onto type labels."""
        from optimization_engine.plugins.post_solve.error_tracker import classify_error
        expectations = {
            "Convergence failure at iteration 50": "convergence_failure",
            "Element distortion detected": "mesh_error",
            "Matrix singularity": "singularity",
            "Out of memory": "memory_error",
            "License checkout failed": "license_error",
            "Random unknown error": "unknown_error",
        }
        for message, label in expectations.items():
            assert classify_error(message) == label

    def test_error_tracking_hook(self, tmp_path):
        """track_error classifies the failure and appends a JSONL log entry."""
        from optimization_engine.plugins.post_solve.error_tracker import track_error
        ctx = {
            "trial_number": 5,
            "working_dir": str(tmp_path),
            "output_dir": str(tmp_path),
            "solver_returncode": 1,
            "error_message": "Convergence failure at iteration 100",
            "design_variables": {"x": 1.0, "y": 2.0}
        }
        outcome = track_error(ctx)
        assert outcome["error_tracked"] is True
        assert outcome["error_type"] == "convergence_failure"

        # The hook appends to a JSONL history file in the output directory.
        log_file = tmp_path / "error_history.jsonl"
        assert log_file.exists()
        with open(log_file) as fh:
            first_entry = json.loads(fh.readline())
        assert first_entry["trial"] == 5
        assert first_entry["error_type"] == "convergence_failure"
class TestPlaybookContextGeneration:
    """Tests for context generation across scoring scenarios."""

    def test_context_for_optimization_task(self):
        """Generated context includes headers, categories, and content."""
        pb = AtomizerPlaybook()
        for cat, text in [
            (InsightCategory.STRATEGY, "Start with coarse mesh"),
            (InsightCategory.MISTAKE, "Avoid tiny elements"),
            (InsightCategory.TOOL, "Use TPE for exploration"),
        ]:
            pb.add_insight(cat, text)
        # Boost the strategy so the scores differ.
        for _ in range(2):
            pb.record_outcome("str-00001", helpful=True)
        rendered = pb.get_context_for_task("optimization", max_items=10)
        for fragment in ("Playbook", "STRATEGY", "coarse mesh"):
            assert fragment in rendered

    def test_context_filtering_by_confidence(self):
        """Items below the confidence floor are excluded from the context."""
        pb = AtomizerPlaybook()
        shaky = pb.add_insight(InsightCategory.STRATEGY, "Questionable advice")
        # 1 helpful vs 3 harmful -> confidence 1/4 = 0.25.
        for verdict in (True, False, False, False):
            pb.record_outcome(shaky.id, helpful=verdict)
        rendered = pb.get_context_for_task(
            "optimization",
            min_confidence=0.5
        )
        assert "Questionable advice" not in rendered

    def test_context_ordering_by_score(self):
        """Higher net-score items are rendered before lower ones."""
        pb = AtomizerPlaybook()
        weak = pb.add_insight(InsightCategory.STRATEGY, "Low score advice")
        strong = pb.add_insight(InsightCategory.STRATEGY, "High score advice")
        # Strong: +5, weak: +1.
        for _ in range(5):
            pb.record_outcome(strong.id, helpful=True)
        pb.record_outcome(weak.id, helpful=True)
        rendered = pb.get_context_for_task("optimization")
        assert rendered.find("High score") < rendered.find("Low score")
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])