""" Integration test for full context engineering pipeline. Tests the complete ACE (Agentic Context Engineering) workflow: 1. Starting fresh session 2. Running optimization with successes and failures 3. Verifying playbook learns from outcomes 4. Validating persistence across sessions 5. Testing context compaction under load """ import pytest from pathlib import Path import tempfile import json from datetime import datetime, timedelta import random from optimization_engine.context.playbook import AtomizerPlaybook, InsightCategory from optimization_engine.context.reflector import AtomizerReflector, OptimizationOutcome from optimization_engine.context.session_state import AtomizerSessionState, TaskType from optimization_engine.context.feedback_loop import FeedbackLoop from optimization_engine.context.compaction import CompactionManager, EventType from optimization_engine.context.cache_monitor import ContextCacheOptimizer, StablePrefixBuilder class TestFullOptimizationPipeline: """End-to-end test of optimization with context engineering.""" def test_complete_optimization_cycle(self, tmp_path): """ Simulates a complete optimization run: 1. Initialize context engineering 2. Process multiple trials (mix of success/failure) 3. Finalize and commit learning 4. Verify playbook has learned """ playbook_path = tmp_path / "playbook.json" # Initialize feedback loop feedback = FeedbackLoop(playbook_path) # Simulate study with mixed results trial_results = [] for i in range(20): success = random.random() > 0.3 # 70% success rate obj_value = 100 - i * 2 + random.uniform(-5, 5) if success else None result = feedback.process_trial_result( trial_number=i, success=success, objective_value=obj_value if success else 0.0, design_variables={ "thickness": 0.5 + i * 0.1, "width": 10 + i * 0.5 }, context_items_used=[], errors=["convergence failure"] if not success else None ) trial_results.append({ "trial": i, "success": success, "insights": result.get("insights_extracted", 0) }) # Finalize study successful = sum(1 for r in trial_results if r["success"]) final_result = feedback.finalize_study({ "name": "integration_test_study", "total_trials": 20, "best_value": min( r.get("objective_value", float('inf')) for r in trial_results if r["success"] ) if successful > 0 else 0, "convergence_rate": successful / 20 }) # Verify learning occurred assert final_result["insights_added"] > 0 assert final_result["playbook_size"] > 0 assert playbook_path.exists() # Load and verify playbook content playbook = AtomizerPlaybook.load(playbook_path) # Should have some mistake insights from failures mistakes = [ item for item in playbook.items.values() if item.category == InsightCategory.MISTAKE ] assert len(mistakes) > 0 def test_learning_persistence_across_sessions(self, tmp_path): """ Test that learning persists across multiple "sessions". """ playbook_path = tmp_path / "playbook.json" # Session 1: Generate initial learning feedback1 = FeedbackLoop(playbook_path) for i in range(10): feedback1.process_trial_result( trial_number=i, success=True, objective_value=100 - i, design_variables={"x": i} ) feedback1.finalize_study({ "name": "session1", "total_trials": 10, "best_value": 91, "convergence_rate": 1.0 }) # Verify session 1 created insights pb1 = AtomizerPlaybook.load(playbook_path) session1_items = len(pb1.items) assert session1_items > 0 # Session 2: Continue learning feedback2 = FeedbackLoop(playbook_path) # Should have loaded existing playbook assert len(feedback2.playbook.items) == session1_items # Add more trials for i in range(10, 20): feedback2.process_trial_result( trial_number=i, success=i % 2 == 0, objective_value=100 - i if i % 2 == 0 else 0.0, design_variables={"x": i}, errors=["test error"] if i % 2 != 0 else None ) feedback2.finalize_study({ "name": "session2", "total_trials": 10, "best_value": 80, "convergence_rate": 0.5 }) # Verify combined learning pb2 = AtomizerPlaybook.load(playbook_path) assert len(pb2.items) >= session1_items # At least as many items def test_playbook_pruning_over_time(self, tmp_path): """ Test that harmful insights get pruned. """ playbook_path = tmp_path / "playbook.json" # Create playbook with a "bad" insight playbook = AtomizerPlaybook() bad_item = playbook.add_insight( InsightCategory.STRATEGY, "Use extremely coarse mesh" # Bad advice ) # Give it many harmful outcomes for _ in range(10): playbook.record_outcome(bad_item.id, helpful=False) playbook.save(playbook_path) # Create feedback loop and finalize feedback = FeedbackLoop(playbook_path) # Process a few trials for i in range(5): feedback.process_trial_result( trial_number=i, success=True, objective_value=100, design_variables={} ) feedback.finalize_study({ "name": "prune_test", "total_trials": 5, "best_value": 100, "convergence_rate": 1.0 }) # Bad insight should be pruned (net_score -10 < threshold -3) final_playbook = AtomizerPlaybook.load(playbook_path) assert bad_item.id not in final_playbook.items def test_context_compaction_under_load(self, tmp_path): """ Test that compaction works correctly under high trial volume. """ manager = CompactionManager( compaction_threshold=20, keep_recent=10, keep_errors=True ) # Simulate 100 trials errors_added = 0 for i in range(100): success = i % 5 != 0 if success: manager.add_trial_event( trial_number=i, success=True, objective=100 - i * 0.5, duration=random.uniform(30, 120) ) else: manager.add_trial_event( trial_number=i, success=False, duration=random.uniform(30, 120) ) manager.add_error_event( f"Error in trial {i}", error_type="test_error" ) errors_added += 1 # Should have compacted stats = manager.get_stats() assert stats["compaction_count"] > 0 # All errors should be preserved assert stats["error_events"] == errors_added # Total events should be bounded assert stats["total_events"] < 100 # Compaction reduced count # Context string should be reasonable length context = manager.get_context_string() assert len(context) < 50000 # Not too long def test_session_state_throughout_optimization(self, tmp_path): """ Test session state tracking throughout an optimization. """ session = AtomizerSessionState(session_id="integration_test") session.exposed.task_type = TaskType.RUN_OPTIMIZATION session.exposed.study_name = "state_test" # Simulate optimization progress for i in range(20): session.add_action(f"Processing trial {i}") if i % 5 == 0 and i > 0: session.update_study_status( name="state_test", status="running", trials_completed=i, trials_total=20, best_value=100 - i, best_trial=i ) if i % 7 == 0: session.add_error(f"Minor issue at trial {i}") # Verify state assert session.exposed.trials_completed == 15 # Last update at i=15 assert len(session.exposed.recent_errors) <= 5 # Bounded # Context should include key information context = session.get_llm_context() assert "state_test" in context assert "running" in context def test_cache_optimization_effectiveness(self): """ Test that cache optimization actually works. """ optimizer = ContextCacheOptimizer() # Build stable prefix (should be cached) builder = StablePrefixBuilder() builder.add_identity("I am Atomizer, an optimization assistant") builder.add_capabilities("I can run FEA optimizations") builder.add_tools("Available tools: NX, Nastran, Optuna") stable_prefix = builder.build() # Simulate 10 requests with same stable prefix for i in range(10): optimizer.prepare_context( stable_prefix=stable_prefix, semi_stable=f"Session info for request {i}", dynamic=f"User message {i}" ) # Should have high cache hit rate assert optimizer.stats.hit_rate >= 0.9 # 9/10 hits assert optimizer.stats.estimated_savings_percent >= 80 # Good savings class TestReflectorLearningPatterns: """Test that the reflector extracts useful patterns.""" def test_convergence_pattern_learning(self, tmp_path): """Test learning from convergence failures.""" playbook = AtomizerPlaybook() reflector = AtomizerReflector(playbook) # Simulate convergence failures for i in range(5): outcome = OptimizationOutcome( trial_number=i, success=False, objective_value=None, solver_errors=["Convergence failure at iteration 100"], design_variables={"x": i * 0.1}, duration_seconds=300 ) reflector.analyze_trial(outcome) reflector.commit_insights() # Should have learned about convergence issues convergence_insights = [ item for item in playbook.items.values() if "convergence" in item.content.lower() ] assert len(convergence_insights) > 0 def test_success_pattern_learning(self, tmp_path): """Test learning from successful designs.""" playbook = AtomizerPlaybook() reflector = AtomizerReflector(playbook) # Simulate successful designs with similar characteristics for i in range(5): outcome = OptimizationOutcome( trial_number=i, success=True, objective_value=50 + i, design_variables={ "thickness": 1.0 + i * 0.1, # All around 1.0-1.5 "width": 10.0 # Consistent }, duration_seconds=60 ) reflector.analyze_trial(outcome) reflector.commit_insights() # Should have learned success patterns success_insights = [ item for item in playbook.items.values() if item.category == InsightCategory.STRATEGY ] assert len(success_insights) > 0 class TestErrorTrackerIntegration: """Test error tracker plugin integration.""" def test_error_classification(self): """Test error classification function.""" from optimization_engine.plugins.post_solve.error_tracker import classify_error assert classify_error("Convergence failure at iteration 50") == "convergence_failure" assert classify_error("Element distortion detected") == "mesh_error" assert classify_error("Matrix singularity") == "singularity" assert classify_error("Out of memory") == "memory_error" assert classify_error("License checkout failed") == "license_error" assert classify_error("Random unknown error") == "unknown_error" def test_error_tracking_hook(self, tmp_path): """Test the error tracking hook function.""" from optimization_engine.plugins.post_solve.error_tracker import track_error context = { "trial_number": 5, "working_dir": str(tmp_path), "output_dir": str(tmp_path), "solver_returncode": 1, "error_message": "Convergence failure at iteration 100", "design_variables": {"x": 1.0, "y": 2.0} } result = track_error(context) assert result["error_tracked"] is True assert result["error_type"] == "convergence_failure" # Should have created error log error_log = tmp_path / "error_history.jsonl" assert error_log.exists() # Verify log content with open(error_log) as f: log_entry = json.loads(f.readline()) assert log_entry["trial"] == 5 assert log_entry["error_type"] == "convergence_failure" class TestPlaybookContextGeneration: """Test context generation for different scenarios.""" def test_context_for_optimization_task(self): """Test context generation for optimization.""" playbook = AtomizerPlaybook() # Add various insights playbook.add_insight(InsightCategory.STRATEGY, "Start with coarse mesh") playbook.add_insight(InsightCategory.MISTAKE, "Avoid tiny elements") playbook.add_insight(InsightCategory.TOOL, "Use TPE for exploration") # Give them different scores playbook.record_outcome("str-00001", helpful=True) playbook.record_outcome("str-00001", helpful=True) context = playbook.get_context_for_task("optimization", max_items=10) assert "Playbook" in context assert "STRATEGY" in context assert "coarse mesh" in context def test_context_filtering_by_confidence(self): """Test that low-confidence items are filtered.""" playbook = AtomizerPlaybook() # Add item with low confidence item = playbook.add_insight(InsightCategory.STRATEGY, "Questionable advice") playbook.record_outcome(item.id, helpful=True) playbook.record_outcome(item.id, helpful=False) playbook.record_outcome(item.id, helpful=False) playbook.record_outcome(item.id, helpful=False) # confidence = 1/4 = 0.25 # High min_confidence should exclude it context = playbook.get_context_for_task( "optimization", min_confidence=0.5 ) assert "Questionable advice" not in context def test_context_ordering_by_score(self): """Test that items are ordered by net score.""" playbook = AtomizerPlaybook() # Add items with different scores low = playbook.add_insight(InsightCategory.STRATEGY, "Low score advice") high = playbook.add_insight(InsightCategory.STRATEGY, "High score advice") # Give high item better score for _ in range(5): playbook.record_outcome(high.id, helpful=True) playbook.record_outcome(low.id, helpful=True) context = playbook.get_context_for_task("optimization") # High score should appear first high_pos = context.find("High score") low_pos = context.find("Low score") assert high_pos < low_pos if __name__ == "__main__": pytest.main([__file__, "-v"])