# Atomizer/tests/test_context_integration.py
"""
Integration test for full context engineering pipeline.
Tests the complete ACE (Agentic Context Engineering) workflow:
1. Starting fresh session
2. Running optimization with successes and failures
3. Verifying playbook learns from outcomes
4. Validating persistence across sessions
5. Testing context compaction under load
"""
import pytest
from pathlib import Path
import tempfile
import json
from datetime import datetime, timedelta
import random
from optimization_engine.context.playbook import AtomizerPlaybook, InsightCategory
from optimization_engine.context.reflector import AtomizerReflector, OptimizationOutcome
from optimization_engine.context.session_state import AtomizerSessionState, TaskType
from optimization_engine.context.feedback_loop import FeedbackLoop
from optimization_engine.context.compaction import CompactionManager, EventType
from optimization_engine.context.cache_monitor import ContextCacheOptimizer, StablePrefixBuilder
class TestFullOptimizationPipeline:
    """End-to-end test of optimization with context engineering."""

    def test_complete_optimization_cycle(self, tmp_path):
        """
        Simulates a complete optimization run:
        1. Initialize context engineering
        2. Process multiple trials (mix of success/failure)
        3. Finalize and commit learning
        4. Verify playbook has learned
        """
        playbook_path = tmp_path / "playbook.json"
        # Initialize feedback loop
        feedback = FeedbackLoop(playbook_path)
        # Seeded RNG: an unseeded random.random() could (rarely) produce zero
        # failures across 20 trials, making the mistake-insight assertion at
        # the bottom of this test flaky.
        rng = random.Random(42)
        # Simulate study with mixed results
        trial_results = []
        for i in range(20):
            success = rng.random() > 0.3  # ~70% success rate
            obj_value = 100 - i * 2 + rng.uniform(-5, 5) if success else None
            result = feedback.process_trial_result(
                trial_number=i,
                success=success,
                objective_value=obj_value if success else 0.0,
                design_variables={
                    "thickness": 0.5 + i * 0.1,
                    "width": 10 + i * 0.5
                },
                context_items_used=[],
                errors=["convergence failure"] if not success else None
            )
            trial_results.append({
                "trial": i,
                "success": success,
                # BUG FIX: this key was previously never recorded, so the
                # best_value computation below always fell back to
                # float('inf') instead of the actual best objective.
                "objective_value": obj_value,
                "insights": result.get("insights_extracted", 0)
            })
        # Finalize study
        successful = sum(1 for r in trial_results if r["success"])
        final_result = feedback.finalize_study({
            "name": "integration_test_study",
            "total_trials": 20,
            "best_value": min(
                r["objective_value"]
                for r in trial_results
                if r["success"] and r["objective_value"] is not None
            ) if successful > 0 else 0,
            "convergence_rate": successful / 20
        })
        # Verify learning occurred
        assert final_result["insights_added"] > 0
        assert final_result["playbook_size"] > 0
        assert playbook_path.exists()
        # Load and verify playbook content
        playbook = AtomizerPlaybook.load(playbook_path)
        # Should have some mistake insights from failures
        mistakes = [
            item for item in playbook.items.values()
            if item.category == InsightCategory.MISTAKE
        ]
        assert len(mistakes) > 0

    def test_learning_persistence_across_sessions(self, tmp_path):
        """
        Test that learning persists across multiple "sessions".
        """
        playbook_path = tmp_path / "playbook.json"
        # Session 1: Generate initial learning
        feedback1 = FeedbackLoop(playbook_path)
        for i in range(10):
            feedback1.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100 - i,
                design_variables={"x": i}
            )
        feedback1.finalize_study({
            "name": "session1",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 1.0
        })
        # Verify session 1 created insights
        pb1 = AtomizerPlaybook.load(playbook_path)
        session1_items = len(pb1.items)
        assert session1_items > 0
        # Session 2: Continue learning
        feedback2 = FeedbackLoop(playbook_path)
        # Should have loaded existing playbook
        assert len(feedback2.playbook.items) == session1_items
        # Add more trials (alternating success/failure)
        for i in range(10, 20):
            feedback2.process_trial_result(
                trial_number=i,
                success=i % 2 == 0,
                objective_value=100 - i if i % 2 == 0 else 0.0,
                design_variables={"x": i},
                errors=["test error"] if i % 2 != 0 else None
            )
        feedback2.finalize_study({
            "name": "session2",
            "total_trials": 10,
            "best_value": 80,
            "convergence_rate": 0.5
        })
        # Verify combined learning
        pb2 = AtomizerPlaybook.load(playbook_path)
        assert len(pb2.items) >= session1_items  # At least as many items

    def test_playbook_pruning_over_time(self, tmp_path):
        """
        Test that harmful insights get pruned.
        """
        playbook_path = tmp_path / "playbook.json"
        # Create playbook with a "bad" insight
        playbook = AtomizerPlaybook()
        bad_item = playbook.add_insight(
            InsightCategory.STRATEGY,
            "Use extremely coarse mesh"  # Bad advice
        )
        # Give it many harmful outcomes
        for _ in range(10):
            playbook.record_outcome(bad_item.id, helpful=False)
        playbook.save(playbook_path)
        # Create feedback loop and finalize
        feedback = FeedbackLoop(playbook_path)
        # Process a few trials
        for i in range(5):
            feedback.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100,
                design_variables={}
            )
        feedback.finalize_study({
            "name": "prune_test",
            "total_trials": 5,
            "best_value": 100,
            "convergence_rate": 1.0
        })
        # Bad insight should be pruned (net_score -10 < threshold -3)
        final_playbook = AtomizerPlaybook.load(playbook_path)
        assert bad_item.id not in final_playbook.items

    def test_context_compaction_under_load(self, tmp_path):
        """
        Test that compaction works correctly under high trial volume.
        """
        manager = CompactionManager(
            compaction_threshold=20,
            keep_recent=10,
            keep_errors=True
        )
        # Seeded RNG so durations (and hence context length) are reproducible.
        rng = random.Random(7)
        # Simulate 100 trials; every 5th trial fails.
        errors_added = 0
        for i in range(100):
            success = i % 5 != 0
            if success:
                manager.add_trial_event(
                    trial_number=i,
                    success=True,
                    objective=100 - i * 0.5,
                    duration=rng.uniform(30, 120)
                )
            else:
                manager.add_trial_event(
                    trial_number=i,
                    success=False,
                    duration=rng.uniform(30, 120)
                )
                manager.add_error_event(
                    f"Error in trial {i}",
                    error_type="test_error"
                )
                errors_added += 1
        # Should have compacted
        stats = manager.get_stats()
        assert stats["compaction_count"] > 0
        # All errors should be preserved
        assert stats["error_events"] == errors_added
        # Total events should be bounded
        assert stats["total_events"] < 100  # Compaction reduced count
        # Context string should be reasonable length
        context = manager.get_context_string()
        assert len(context) < 50000  # Not too long

    def test_session_state_throughout_optimization(self, tmp_path):
        """
        Test session state tracking throughout an optimization.
        """
        session = AtomizerSessionState(session_id="integration_test")
        session.exposed.task_type = TaskType.RUN_OPTIMIZATION
        session.exposed.study_name = "state_test"
        # Simulate optimization progress
        for i in range(20):
            session.add_action(f"Processing trial {i}")
            if i % 5 == 0 and i > 0:
                session.update_study_status(
                    name="state_test",
                    status="running",
                    trials_completed=i,
                    trials_total=20,
                    best_value=100 - i,
                    best_trial=i
                )
            if i % 7 == 0:
                session.add_error(f"Minor issue at trial {i}")
        # Verify state
        assert session.exposed.trials_completed == 15  # Last update at i=15
        assert len(session.exposed.recent_errors) <= 5  # Bounded
        # Context should include key information
        context = session.get_llm_context()
        assert "state_test" in context
        assert "running" in context

    def test_cache_optimization_effectiveness(self):
        """
        Test that cache optimization actually works.
        """
        optimizer = ContextCacheOptimizer()
        # Build stable prefix (should be cached)
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer, an optimization assistant")
        builder.add_capabilities("I can run FEA optimizations")
        builder.add_tools("Available tools: NX, Nastran, Optuna")
        stable_prefix = builder.build()
        # Simulate 10 requests with same stable prefix
        for i in range(10):
            optimizer.prepare_context(
                stable_prefix=stable_prefix,
                semi_stable=f"Session info for request {i}",
                dynamic=f"User message {i}"
            )
        # Should have high cache hit rate
        assert optimizer.stats.hit_rate >= 0.9  # 9/10 hits
        assert optimizer.stats.estimated_savings_percent >= 80  # Good savings
class TestReflectorLearningPatterns:
    """Test that the reflector extracts useful patterns."""

    def test_convergence_pattern_learning(self, tmp_path):
        """Test learning from convergence failures."""
        playbook = AtomizerPlaybook()
        reflector = AtomizerReflector(playbook)
        # Feed the reflector a streak of solver convergence failures.
        for trial_idx in range(5):
            failed_trial = OptimizationOutcome(
                trial_number=trial_idx,
                success=False,
                objective_value=None,
                solver_errors=["Convergence failure at iteration 100"],
                design_variables={"x": trial_idx * 0.1},
                duration_seconds=300
            )
            reflector.analyze_trial(failed_trial)
        reflector.commit_insights()
        # The playbook should now hold at least one convergence-related insight.
        matching = [
            entry for entry in playbook.items.values()
            if "convergence" in entry.content.lower()
        ]
        assert len(matching) > 0

    def test_success_pattern_learning(self, tmp_path):
        """Test learning from successful designs."""
        playbook = AtomizerPlaybook()
        reflector = AtomizerReflector(playbook)
        # Feed the reflector a run of similar, successful designs.
        for trial_idx in range(5):
            good_trial = OptimizationOutcome(
                trial_number=trial_idx,
                success=True,
                objective_value=50 + trial_idx,
                design_variables={
                    "thickness": 1.0 + trial_idx * 0.1,  # All around 1.0-1.5
                    "width": 10.0  # Consistent
                },
                duration_seconds=60
            )
            reflector.analyze_trial(good_trial)
        reflector.commit_insights()
        # At least one STRATEGY insight should have been committed.
        strategies = [
            entry for entry in playbook.items.values()
            if entry.category == InsightCategory.STRATEGY
        ]
        assert len(strategies) > 0
class TestErrorTrackerIntegration:
    """Test error tracker plugin integration."""

    def test_error_classification(self):
        """Test error classification function."""
        from optimization_engine.plugins.post_solve.error_tracker import classify_error
        # Each solver message should map to its expected error category.
        expected = {
            "Convergence failure at iteration 50": "convergence_failure",
            "Element distortion detected": "mesh_error",
            "Matrix singularity": "singularity",
            "Out of memory": "memory_error",
            "License checkout failed": "license_error",
            "Random unknown error": "unknown_error",
        }
        for message, label in expected.items():
            assert classify_error(message) == label

    def test_error_tracking_hook(self, tmp_path):
        """Test the error tracking hook function."""
        from optimization_engine.plugins.post_solve.error_tracker import track_error
        hook_context = {
            "trial_number": 5,
            "working_dir": str(tmp_path),
            "output_dir": str(tmp_path),
            "solver_returncode": 1,
            "error_message": "Convergence failure at iteration 100",
            "design_variables": {"x": 1.0, "y": 2.0}
        }
        outcome = track_error(hook_context)
        assert outcome["error_tracked"] is True
        assert outcome["error_type"] == "convergence_failure"
        # A JSONL error log should have been written into the output dir.
        error_log = tmp_path / "error_history.jsonl"
        assert error_log.exists()
        # The first record must describe the failed trial.
        first_line = error_log.read_text().splitlines()[0]
        record = json.loads(first_line)
        assert record["trial"] == 5
        assert record["error_type"] == "convergence_failure"
class TestPlaybookContextGeneration:
    """Test context generation for different scenarios."""

    def test_context_for_optimization_task(self):
        """Test context generation for optimization."""
        playbook = AtomizerPlaybook()
        # Add various insights
        strategy = playbook.add_insight(InsightCategory.STRATEGY, "Start with coarse mesh")
        playbook.add_insight(InsightCategory.MISTAKE, "Avoid tiny elements")
        playbook.add_insight(InsightCategory.TOOL, "Use TPE for exploration")
        # Boost the strategy item's score. Use the id of the item returned by
        # add_insight rather than the previously hard-coded "str-00001", so
        # this test no longer depends on the playbook's internal id format.
        playbook.record_outcome(strategy.id, helpful=True)
        playbook.record_outcome(strategy.id, helpful=True)
        context = playbook.get_context_for_task("optimization", max_items=10)
        assert "Playbook" in context
        assert "STRATEGY" in context
        assert "coarse mesh" in context

    def test_context_filtering_by_confidence(self):
        """Test that low-confidence items are filtered."""
        playbook = AtomizerPlaybook()
        # Add item with low confidence: 1 helpful vs 3 harmful outcomes.
        item = playbook.add_insight(InsightCategory.STRATEGY, "Questionable advice")
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=False)
        playbook.record_outcome(item.id, helpful=False)
        playbook.record_outcome(item.id, helpful=False)
        # confidence = 1/4 = 0.25
        # High min_confidence should exclude it
        context = playbook.get_context_for_task(
            "optimization",
            min_confidence=0.5
        )
        assert "Questionable advice" not in context

    def test_context_ordering_by_score(self):
        """Test that items are ordered by net score."""
        playbook = AtomizerPlaybook()
        # Add items with different scores
        low = playbook.add_insight(InsightCategory.STRATEGY, "Low score advice")
        high = playbook.add_insight(InsightCategory.STRATEGY, "High score advice")
        # Give high item better score (5 helpful vs 1)
        for _ in range(5):
            playbook.record_outcome(high.id, helpful=True)
        playbook.record_outcome(low.id, helpful=True)
        context = playbook.get_context_for_task("optimization")
        # High score should appear first
        high_pos = context.find("High score")
        low_pos = context.find("Low score")
        assert high_pos < low_pos
# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])