# Atomizer/tests/test_context_engineering.py
"""
Test suite for context engineering components.
Tests the ACE (Agentic Context Engineering) implementation:
- Playbook: Knowledge store with helpful/harmful tracking
- Reflector: Outcome analysis and insight extraction
- SessionState: Context isolation
- Compaction: Long-running session management
- FeedbackLoop: Automated learning
"""
import pytest
from pathlib import Path
import tempfile
import json
from datetime import datetime
from optimization_engine.context.playbook import (
AtomizerPlaybook,
PlaybookItem,
InsightCategory
)
from optimization_engine.context.reflector import (
AtomizerReflector,
OptimizationOutcome
)
from optimization_engine.context.session_state import (
AtomizerSessionState,
TaskType,
ExposedState,
IsolatedState
)
from optimization_engine.context.compaction import (
CompactionManager,
ContextEvent,
EventType,
ContextBudgetManager
)
from optimization_engine.context.cache_monitor import (
ContextCacheOptimizer,
CacheStats,
StablePrefixBuilder
)
from optimization_engine.context.feedback_loop import (
FeedbackLoop
)
class TestAtomizerPlaybook:
    """Tests for the playbook system (ACE knowledge store).

    Covers item creation and ID assignment, per-category numbering,
    content deduplication, helpful/harmful outcome tracking, confidence
    scoring, JSON persistence, pruning of harmful items, content search,
    and task-context rendering.
    """
    def test_create_empty_playbook(self):
        """A fresh playbook has no items and starts at version 1."""
        playbook = AtomizerPlaybook()
        assert len(playbook.items) == 0
        assert playbook.version == 1
    def test_add_insight(self):
        """Adding an insight assigns a category-prefixed ID, zeroed
        helpful/harmful counters, and records the source trial."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(
            category=InsightCategory.STRATEGY,
            content="Use shell elements for thin walls",
            source_trial=1
        )
        assert item.id == "str-00001"
        assert item.helpful_count == 0
        assert item.harmful_count == 0
        assert item.category == InsightCategory.STRATEGY
        assert len(playbook.items) == 1
        assert 1 in item.source_trials
    def test_add_multiple_categories(self):
        """IDs are numbered independently within each category."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.STRATEGY, "Strategy 1")
        playbook.add_insight(InsightCategory.MISTAKE, "Mistake 1")
        playbook.add_insight(InsightCategory.TOOL, "Tool tip 1")
        playbook.add_insight(InsightCategory.STRATEGY, "Strategy 2")
        assert len(playbook.items) == 4
        assert "str-00001" in playbook.items
        assert "str-00002" in playbook.items
        assert "mis-00001" in playbook.items
        assert "tool-00001" in playbook.items
    def test_deduplication(self):
        """Re-adding identical content merges into the existing item and
        counts the duplicate as implicit helpful feedback."""
        playbook = AtomizerPlaybook()
        item1 = playbook.add_insight(InsightCategory.STRATEGY, "Use shell elements")
        item2 = playbook.add_insight(InsightCategory.STRATEGY, "Use shell elements")
        # Should merge into one item
        assert len(playbook.items) == 1
        # Helpful count incremented on duplicate
        assert item2.helpful_count == 1
        assert item1 is item2  # Same object
    def test_outcome_tracking(self):
        """Counters, net score, and confidence track recorded outcomes."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(InsightCategory.STRATEGY, "Test insight")
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=False)
        assert item.helpful_count == 2
        assert item.harmful_count == 1
        assert item.net_score == 1
        # Compare the derived float ratio with approx rather than exact
        # equality: the implementation may compute 2/3 in any equivalent
        # order, and exact float comparison is brittle.
        assert item.confidence == pytest.approx(2 / 3)
    def test_confidence_calculation(self):
        """Confidence starts neutral (0.5) and follows the helpful ratio."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(InsightCategory.STRATEGY, "Test")
        # Initial confidence is 0.5 (neutral)
        assert item.confidence == pytest.approx(0.5)
        # After positive feedback
        playbook.record_outcome(item.id, helpful=True)
        assert item.confidence == pytest.approx(1.0)
        # After mixed feedback
        playbook.record_outcome(item.id, helpful=False)
        assert item.confidence == pytest.approx(0.5)
    def test_persistence(self, tmp_path):
        """Items, outcome counts, and tags survive a save/load cycle."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.MISTAKE, "Don't do this", tags=["test"])
        playbook.add_insight(InsightCategory.STRATEGY, "Do this instead")
        # Record some outcomes
        playbook.record_outcome("mis-00001", helpful=False)
        playbook.record_outcome("str-00001", helpful=True)
        save_path = tmp_path / "playbook.json"
        playbook.save(save_path)
        # Load and verify
        loaded = AtomizerPlaybook.load(save_path)
        assert len(loaded.items) == 2
        assert "mis-00001" in loaded.items
        assert loaded.items["mis-00001"].harmful_count == 1
        assert loaded.items["str-00001"].helpful_count == 1
        assert "test" in loaded.items["mis-00001"].tags
    def test_pruning(self):
        """Items whose net score falls below the threshold are removed."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(InsightCategory.STRATEGY, "Bad advice")
        # Record many harmful outcomes
        for _ in range(5):
            playbook.record_outcome(item.id, helpful=False)
        assert item.net_score == -5
        # Prune with threshold -3
        removed = playbook.prune_harmful(threshold=-3)
        assert removed == 1
        assert len(playbook.items) == 0
    def test_search_by_content(self):
        """Content search returns items matching the query text."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.STRATEGY, "Use shell elements for thin walls")
        playbook.add_insight(InsightCategory.STRATEGY, "Solid elements for thick parts")
        playbook.add_insight(InsightCategory.MISTAKE, "Don't use coarse mesh")
        results = playbook.search_by_content("shell elements")
        assert len(results) >= 1
        assert "shell" in results[0].content.lower()
    def test_get_context_for_task(self):
        """The rendered context lists items with IDs and helpful counts."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.STRATEGY, "Strategy 1")
        playbook.add_insight(InsightCategory.MISTAKE, "Mistake 1")
        # Make strategy have higher score
        playbook.record_outcome("str-00001", helpful=True)
        playbook.record_outcome("str-00001", helpful=True)
        context = playbook.get_context_for_task("optimization")
        assert "Playbook" in context
        assert "str-00001" in context
        assert "helpful=2" in context
class TestAtomizerReflector:
    """Exercises outcome analysis: success/failure insight extraction,
    mesh-error tagging, committing to the playbook, and study review."""
    @staticmethod
    def _make_outcome(**overrides):
        """Build an OptimizationOutcome, overriding only the fields a test needs."""
        fields = dict(
            trial_number=1,
            success=True,
            objective_value=100.0,
            constraint_violations=[],
            solver_errors=[],
            design_variables={},
            extractor_used="",
            duration_seconds=60,
        )
        fields.update(overrides)
        return OptimizationOutcome(**fields)
    def test_create_reflector(self):
        """A new reflector wraps the given playbook with nothing pending."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)
        assert reflector.playbook is book
        assert len(reflector.pending_insights) == 0
    def test_analyze_successful_trial(self):
        """A successful trial yields a helpful insight and is recorded."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        outcome = self._make_outcome(
            design_variables={"thickness": 1.0, "width": 5.0},
            extractor_used="mass_extractor",
        )
        insights = reflector.analyze_trial(outcome)
        assert len(insights) >= 1
        assert any(i.helpful for i in insights)
        assert 1 in reflector.analyzed_trials
    def test_analyze_failed_trial(self):
        """A failed trial produces mistake insights for errors/violations."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        outcome = self._make_outcome(
            success=False,
            objective_value=None,
            constraint_violations=["stress > 250 MPa"],
            solver_errors=["convergence failure at iteration 50"],
            design_variables={"thickness": 0.5},
            extractor_used="stress_extractor",
            duration_seconds=120,
        )
        insights = reflector.analyze_trial(outcome)
        assert len(insights) >= 2  # At least error + constraint
        mistakes = [i for i in insights if i.category == InsightCategory.MISTAKE]
        assert mistakes
        assert not any(i.helpful for i in mistakes)
    def test_analyze_mesh_error(self):
        """A jacobian/distortion solver error gets tagged as mesh-related."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        outcome = self._make_outcome(
            trial_number=5,
            success=False,
            objective_value=None,
            solver_errors=["Element distortion: negative jacobian detected"],
            duration_seconds=30,
        )
        insights = reflector.analyze_trial(outcome)
        assert any("mesh" in str(i.tags).lower() for i in insights)
    def test_commit_insights(self):
        """Committing moves pending insights into the playbook, then clears."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)
        reflector.analyze_trial(self._make_outcome(
            design_variables={"thickness": 1.0},
            extractor_used="mass_extractor",
        ))
        committed = reflector.commit_insights()
        assert committed > 0
        assert len(book.items) > 0
        assert len(reflector.pending_insights) == 0  # Cleared after commit
    def test_analyze_study_completion(self):
        """A high-convergence study yields a 'robust' method insight."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        insights = reflector.analyze_study_completion(
            study_name="test_study",
            total_trials=100,
            best_value=50.0,
            convergence_rate=0.95,
            method="TPE",
        )
        assert len(insights) >= 1
        assert any("robust" in i.content.lower() for i in insights)
class TestSessionState:
    """Exercises AtomizerSessionState: exposed vs. isolated context,
    action/error logs with compression, and save/load round-trips."""
    def test_create_session(self):
        """A new session starts with no task type and an empty action log."""
        state = AtomizerSessionState(session_id="test_session")
        assert state.session_id == "test_session"
        assert state.exposed.task_type is None
        assert len(state.exposed.recent_actions) == 0
    def test_set_task_type(self):
        """The exposed task type is a plain settable attribute."""
        state = AtomizerSessionState(session_id="test")
        state.exposed.task_type = TaskType.CREATE_STUDY
        assert state.exposed.task_type == TaskType.CREATE_STUDY
    def test_add_action(self):
        """Actions accumulate in order on the exposed state."""
        state = AtomizerSessionState(session_id="test")
        state.add_action("Created study directory")
        state.add_action("Configured optimization")
        assert len(state.exposed.recent_actions) == 2
        assert "Created study" in state.exposed.recent_actions[0]
    def test_action_compression(self):
        """Once the log exceeds its limit, older entries are summarized."""
        state = AtomizerSessionState(session_id="test")
        for n in range(15):
            state.add_action(f"Action {n}")
        actions = state.exposed.recent_actions
        assert len(actions) <= 12
        assert any("earlier actions" in entry.lower() for entry in actions)
    def test_add_error(self):
        """Errors are recorded; the type tag is rendered in brackets."""
        state = AtomizerSessionState(session_id="test")
        state.add_error("Solver failed", error_type="convergence")
        state.add_error("Mesh error")
        assert len(state.exposed.recent_errors) == 2
        assert "[convergence]" in state.exposed.recent_errors[0]
    def test_update_study_status(self):
        """update_study_status mirrors progress fields onto exposed state."""
        state = AtomizerSessionState(session_id="test")
        state.update_study_status(
            name="bracket_opt",
            status="running",
            trials_completed=25,
            trials_total=100,
            best_value=0.5,
            best_trial=20,
        )
        assert state.exposed.study_name == "bracket_opt"
        assert state.exposed.trials_completed == 25
        assert state.exposed.best_value == 0.5
    def test_llm_context_generation(self):
        """The context string surfaces study name, progress, and task type."""
        state = AtomizerSessionState(session_id="test")
        state.exposed.task_type = TaskType.RUN_OPTIMIZATION
        state.exposed.study_name = "test_study"
        state.exposed.trials_completed = 50
        state.exposed.trials_total = 100
        state.exposed.best_value = 0.5
        rendered = state.get_llm_context()
        for expected in ("test_study", "50", "0.5", "run_optimization"):
            assert expected in rendered
    def test_isolated_state_access(self):
        """Isolated data is hidden from the LLM but explicitly loadable."""
        state = AtomizerSessionState(session_id="test")
        state.isolated.nx_model_path = "/path/to/model.prt"
        # Must not leak into the exposed context string
        assert "/path/to/model.prt" not in state.get_llm_context()
        # But remains accessible on explicit request
        assert state.load_isolated_data("nx_model_path") == "/path/to/model.prt"
    def test_persistence(self, tmp_path):
        """A saved session round-trips id, task type, and study name."""
        state = AtomizerSessionState(session_id="test_persist")
        state.exposed.task_type = TaskType.ANALYZE_RESULTS
        state.exposed.study_name = "persist_study"
        state.add_action("Test action")
        target = tmp_path / "session.json"
        state.save(target)
        restored = AtomizerSessionState.load(target)
        assert restored.session_id == "test_persist"
        assert restored.exposed.task_type == TaskType.ANALYZE_RESULTS
        assert restored.exposed.study_name == "persist_study"
class TestCompactionManager:
    """Exercises event accumulation, threshold-triggered compaction, and
    preservation of errors and milestones through compaction."""
    def test_create_manager(self):
        """Constructor records the configured thresholds and starts empty."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        assert mgr.compaction_threshold == 10
        assert mgr.keep_recent == 5
        assert len(mgr.events) == 0
    def test_add_events(self):
        """Trial events accumulate while below the compaction threshold."""
        mgr = CompactionManager(compaction_threshold=50)
        mgr.add_trial_event(trial_number=1, success=True, objective=100.0)
        mgr.add_trial_event(trial_number=2, success=False)
        assert len(mgr.events) == 2
    def test_compaction_trigger(self):
        """Exceeding the threshold compacts the log back under it."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        for n in range(15):
            event = ContextEvent(
                timestamp=datetime.now(),
                event_type=EventType.TRIAL_COMPLETE,
                summary=f"Trial {n} complete",
                details={"trial_number": n, "objective": n * 0.1},
            )
            mgr.add_event(event)
        assert mgr.compaction_count > 0
        assert len(mgr.events) <= 10
    def test_error_preservation(self):
        """Error events survive compaction regardless of their age."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=3)
        # Record the error first so compaction would otherwise discard it
        mgr.add_error_event("Critical solver failure", "solver_error")
        for n in range(20):
            mgr.add_trial_event(trial_number=n, success=True, objective=n)
        kept_errors = [e for e in mgr.events if e.event_type == EventType.ERROR]
        assert len(kept_errors) == 1
        assert "Critical solver failure" in kept_errors[0].summary
    def test_milestone_preservation(self):
        """Milestone events survive compaction regardless of their age."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=3)
        mgr.add_milestone("Optimization started", {"method": "TPE"})
        for n in range(20):
            mgr.add_trial_event(trial_number=n, success=True)
        kept = [e for e in mgr.events if e.event_type == EventType.MILESTONE]
        assert len(kept) == 1
    def test_context_string_generation(self):
        """The rendered history mentions the header, trials, and errors."""
        mgr = CompactionManager()
        mgr.add_trial_event(trial_number=1, success=True, objective=100.0)
        mgr.add_error_event("Test error")
        rendered = mgr.get_context_string()
        assert "Optimization History" in rendered
        assert "Trial 1" in rendered
        assert "Test error" in rendered
    def test_get_stats(self):
        """Stats report event totals and how many compactions occurred."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        for n in range(15):
            mgr.add_trial_event(trial_number=n, success=n % 2 == 0)
        stats = mgr.get_stats()
        assert stats["total_events"] <= 15
        assert stats["compaction_count"] > 0
class TestCacheMonitor:
    """Exercises ContextCacheOptimizer hit/miss accounting and the
    StablePrefixBuilder section-ordering contract."""
    def test_create_optimizer(self):
        """A new optimizer starts with zeroed request/hit counters."""
        opt = ContextCacheOptimizer()
        assert opt.stats.total_requests == 0
        assert opt.stats.cache_hits == 0
    def test_prepare_context(self):
        """prepare_context includes all three tiers and counts the request."""
        opt = ContextCacheOptimizer()
        assembled = opt.prepare_context(
            stable_prefix="Stable content",
            semi_stable="Session content",
            dynamic="User message",
        )
        for fragment in ("Stable content", "Session content", "User message"):
            assert fragment in assembled
        assert opt.stats.total_requests == 1
    def test_cache_hit_detection(self):
        """A repeated stable prefix counts as a cache hit."""
        opt = ContextCacheOptimizer()
        opt.prepare_context("Stable", "Semi", "Dynamic 1")  # first request
        opt.prepare_context("Stable", "Semi", "Dynamic 2")  # same stable prefix
        assert opt.stats.total_requests == 2
        assert opt.stats.cache_hits == 1
    def test_cache_miss_detection(self):
        """Changing the stable prefix yields misses, never hits."""
        opt = ContextCacheOptimizer()
        opt.prepare_context("Stable 1", "Semi", "Dynamic")
        opt.prepare_context("Stable 2", "Semi", "Dynamic")  # different prefix
        assert opt.stats.cache_hits == 0
        assert opt.stats.cache_misses == 2
    def test_stable_prefix_builder(self):
        """Sections appear in the built prefix in their priority order."""
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer")
        builder.add_capabilities("I can optimize")
        builder.add_tools("Tool definitions here")
        prefix = builder.build()
        assert "I am Atomizer" in prefix
        assert "I can optimize" in prefix
        # Identity (order 10) must precede capabilities (order 20)
        assert prefix.index("Atomizer") < prefix.index("optimize")
class TestFeedbackLoop:
    """Exercises the automated learning loop: per-trial processing,
    study finalization, and outcome attribution to playbook items."""
    def test_create_feedback_loop(self, tmp_path):
        """A new loop owns a playbook and has processed zero trials."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        assert loop.playbook is not None
        assert loop._total_trials_processed == 0
    def test_process_successful_trial(self, tmp_path):
        """A successful trial is echoed back and counted as a success."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        result = loop.process_trial_result(
            trial_number=1,
            success=True,
            objective_value=100.0,
            design_variables={"thickness": 1.0},
        )
        assert result["trial_number"] == 1
        assert result["success"] is True
        assert loop._total_trials_processed == 1
        assert loop._successful_trials == 1
    def test_process_failed_trial(self, tmp_path):
        """A failed trial with errors is counted as a failure."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        result = loop.process_trial_result(
            trial_number=1,
            success=False,
            objective_value=0.0,
            design_variables={"thickness": 0.5},
            errors=["Convergence failure"],
        )
        assert result["success"] is False
        assert loop._failed_trials == 1
    def test_finalize_study(self, tmp_path):
        """Finalizing persists the playbook and reports insight counts."""
        playbook_file = tmp_path / "playbook.json"
        loop = FeedbackLoop(playbook_file)
        # Feed a mix: two of every three trials succeed
        for n in range(10):
            succeeded = n % 3 != 0
            loop.process_trial_result(
                trial_number=n,
                success=succeeded,
                objective_value=100 - n if succeeded else 0,
                design_variables={"x": n * 0.1},
            )
        summary = loop.finalize_study({
            "name": "test_study",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 0.7,
        })
        assert summary["insights_added"] > 0
        assert summary["playbook_size"] > 0
        assert playbook_file.exists()  # Should be saved
    def test_playbook_item_attribution(self, tmp_path):
        """Items active during successful trials accrue helpful feedback."""
        playbook_file = tmp_path / "playbook.json"
        # Seed the on-disk playbook with one strategy item
        seed = AtomizerPlaybook()
        item = seed.add_insight(InsightCategory.STRATEGY, "Test strategy")
        seed.save(playbook_file)
        # Run two successful trials that cite the seeded item
        loop = FeedbackLoop(playbook_file)
        for trial, objective in ((1, 100.0), (2, 95.0)):
            loop.process_trial_result(
                trial_number=trial,
                success=True,
                objective_value=objective,
                design_variables={},
                context_items_used=[item.id],
            )
        assert loop.playbook.items[item.id].helpful_count == 2
class TestContextBudgetManager:
    """Exercises token estimation, per-section usage tracking, and
    over-budget reporting."""
    def test_create_manager(self):
        """Default budget totals 100k tokens with a stable_prefix slot."""
        budget_mgr = ContextBudgetManager()
        assert budget_mgr.budget["total"] == 100000
        assert "stable_prefix" in budget_mgr.budget
    def test_estimate_tokens(self):
        """Estimation is chars // 4 — "Hello world" (11 chars) -> 2 tokens."""
        budget_mgr = ContextBudgetManager()
        assert budget_mgr.estimate_tokens("Hello world") == 2
    def test_update_usage(self):
        """Within-budget usage reports section, tokens, over_budget=False."""
        budget_mgr = ContextBudgetManager()
        report = budget_mgr.update_usage("stable_prefix", "x" * 20000)  # 5000 tokens
        assert report["section"] == "stable_prefix"
        assert report["tokens"] == 5000
        assert report["over_budget"] is False
    def test_over_budget_warning(self):
        """Exceeding a section budget flags it and attaches a warning."""
        budget_mgr = ContextBudgetManager()
        # stable_prefix budget is 5000 tokens (20000 chars); write double that
        report = budget_mgr.update_usage("stable_prefix", "x" * 40000)
        assert report["over_budget"] is True
        assert "warning" in report
    def test_get_status(self):
        """Overall status exposes totals, utilization, and recommendations."""
        budget_mgr = ContextBudgetManager()
        budget_mgr.update_usage("stable_prefix", "x" * 10000)
        budget_mgr.update_usage("protocols", "x" * 20000)
        status = budget_mgr.get_status()
        for key in ("total_used", "utilization", "recommendations"):
            assert key in status
if __name__ == "__main__":
    # pytest.main returns an exit status; propagate it so running this file
    # directly exits non-zero on test failure (discarding it always exits 0).
    raise SystemExit(pytest.main([__file__, "-v"]))