Complete implementation of Agentic Context Engineering (ACE) framework: Core modules (optimization_engine/context/): - playbook.py: AtomizerPlaybook with helpful/harmful scoring - reflector.py: AtomizerReflector for insight extraction - session_state.py: Context isolation (exposed/isolated state) - feedback_loop.py: Automated learning from trial results - compaction.py: Long-session context management - cache_monitor.py: KV-cache optimization tracking - runner_integration.py: OptimizationRunner integration Dashboard integration: - context.py: 12 REST API endpoints for playbook management Tests: - test_context_engineering.py: 44 unit tests - test_context_integration.py: 16 integration tests Documentation: - CONTEXT_ENGINEERING_REPORT.md: Comprehensive implementation report - CONTEXT_ENGINEERING_API.md: Complete API reference - SYS_17_CONTEXT_ENGINEERING.md: System protocol - Updated cheatsheet with SYS_17 quick reference - Enhanced bootstrap (00_BOOTSTRAP_V2.md) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
464 lines
16 KiB
Python
464 lines
16 KiB
Python
"""
|
|
Integration test for full context engineering pipeline.
|
|
|
|
Tests the complete ACE (Agentic Context Engineering) workflow:
|
|
1. Starting fresh session
|
|
2. Running optimization with successes and failures
|
|
3. Verifying playbook learns from outcomes
|
|
4. Validating persistence across sessions
|
|
5. Testing context compaction under load
|
|
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
import tempfile
|
|
import json
|
|
from datetime import datetime, timedelta
|
|
import random
|
|
|
|
from optimization_engine.context.playbook import AtomizerPlaybook, InsightCategory
|
|
from optimization_engine.context.reflector import AtomizerReflector, OptimizationOutcome
|
|
from optimization_engine.context.session_state import AtomizerSessionState, TaskType
|
|
from optimization_engine.context.feedback_loop import FeedbackLoop
|
|
from optimization_engine.context.compaction import CompactionManager, EventType
|
|
from optimization_engine.context.cache_monitor import ContextCacheOptimizer, StablePrefixBuilder
|
|
|
|
|
|
class TestFullOptimizationPipeline:
    """End-to-end test of optimization with context engineering."""

    def test_complete_optimization_cycle(self, tmp_path):
        """
        Simulates a complete optimization run:

        1. Initialize context engineering
        2. Process multiple trials (mix of success/failure)
        3. Finalize and commit learning
        4. Verify playbook has learned
        """
        playbook_path = tmp_path / "playbook.json"

        # Seeded RNG so the success/failure mix is reproducible.  The
        # original unseeded global `random` made this test flaky: a run
        # with zero failures broke the "mistakes learned" check below.
        rng = random.Random(42)

        # Initialize feedback loop
        feedback = FeedbackLoop(playbook_path)

        # Simulate study with mixed results (~70% success rate)
        trial_results = []
        for i in range(20):
            success = rng.random() > 0.3  # 70% success rate
            obj_value = 100 - i * 2 + rng.uniform(-5, 5) if success else None

            result = feedback.process_trial_result(
                trial_number=i,
                success=success,
                objective_value=obj_value if success else 0.0,
                design_variables={
                    "thickness": 0.5 + i * 0.1,
                    "width": 10 + i * 0.5
                },
                context_items_used=[],
                errors=["convergence failure"] if not success else None
            )

            trial_results.append({
                "trial": i,
                "success": success,
                # BUG FIX: objective_value was never stored here before, so
                # the best_value computation below always fell back to
                # float('inf') via dict.get().
                "objective_value": obj_value,
                "insights": result.get("insights_extracted", 0)
            })

        # Finalize study
        successful = sum(1 for r in trial_results if r["success"])
        final_result = feedback.finalize_study({
            "name": "integration_test_study",
            "total_trials": 20,
            "best_value": min(
                r["objective_value"]
                for r in trial_results if r["success"]
            ) if successful > 0 else 0,
            "convergence_rate": successful / 20
        })

        # Verify learning occurred
        assert final_result["insights_added"] > 0
        assert final_result["playbook_size"] > 0
        assert playbook_path.exists()

        # Load and verify playbook content
        playbook = AtomizerPlaybook.load(playbook_path)

        # Should have some mistake insights from failures (the seeded RNG
        # guarantees at least one failed trial in the 20 above)
        mistakes = [
            item for item in playbook.items.values()
            if item.category == InsightCategory.MISTAKE
        ]
        assert len(mistakes) > 0

    def test_learning_persistence_across_sessions(self, tmp_path):
        """
        Test that learning persists across multiple "sessions".
        """
        playbook_path = tmp_path / "playbook.json"

        # Session 1: Generate initial learning (all trials succeed)
        feedback1 = FeedbackLoop(playbook_path)
        for i in range(10):
            feedback1.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100 - i,
                design_variables={"x": i}
            )
        feedback1.finalize_study({
            "name": "session1",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 1.0
        })

        # Verify session 1 created insights
        pb1 = AtomizerPlaybook.load(playbook_path)
        session1_items = len(pb1.items)
        assert session1_items > 0

        # Session 2: Continue learning with a fresh FeedbackLoop instance
        feedback2 = FeedbackLoop(playbook_path)

        # Should have loaded the existing playbook from disk
        assert len(feedback2.playbook.items) == session1_items

        # Add more trials (alternating success/failure)
        for i in range(10, 20):
            feedback2.process_trial_result(
                trial_number=i,
                success=i % 2 == 0,
                objective_value=100 - i if i % 2 == 0 else 0.0,
                design_variables={"x": i},
                errors=["test error"] if i % 2 != 0 else None
            )
        feedback2.finalize_study({
            "name": "session2",
            "total_trials": 10,
            "best_value": 80,
            "convergence_rate": 0.5
        })

        # Verify combined learning
        pb2 = AtomizerPlaybook.load(playbook_path)
        assert len(pb2.items) >= session1_items  # At least as many items

    def test_playbook_pruning_over_time(self, tmp_path):
        """
        Test that harmful insights get pruned.
        """
        playbook_path = tmp_path / "playbook.json"

        # Create playbook with a "bad" insight
        playbook = AtomizerPlaybook()
        bad_item = playbook.add_insight(
            InsightCategory.STRATEGY,
            "Use extremely coarse mesh"  # Bad advice
        )

        # Give it many harmful outcomes
        for _ in range(10):
            playbook.record_outcome(bad_item.id, helpful=False)

        playbook.save(playbook_path)

        # Create feedback loop and finalize
        feedback = FeedbackLoop(playbook_path)

        # Process a few trials
        for i in range(5):
            feedback.process_trial_result(
                trial_number=i,
                success=True,
                objective_value=100,
                design_variables={}
            )

        feedback.finalize_study({
            "name": "prune_test",
            "total_trials": 5,
            "best_value": 100,
            "convergence_rate": 1.0
        })

        # Bad insight should be pruned (net_score -10 < threshold -3)
        final_playbook = AtomizerPlaybook.load(playbook_path)
        assert bad_item.id not in final_playbook.items

    def test_context_compaction_under_load(self, tmp_path):
        """
        Test that compaction works correctly under high trial volume.
        """
        # Seeded RNG so trial durations are reproducible run-to-run.
        rng = random.Random(7)

        manager = CompactionManager(
            compaction_threshold=20,
            keep_recent=10,
            keep_errors=True
        )

        # Simulate 100 trials; every 5th trial fails and logs an error
        errors_added = 0
        for i in range(100):
            success = i % 5 != 0

            if success:
                manager.add_trial_event(
                    trial_number=i,
                    success=True,
                    objective=100 - i * 0.5,
                    duration=rng.uniform(30, 120)
                )
            else:
                manager.add_trial_event(
                    trial_number=i,
                    success=False,
                    duration=rng.uniform(30, 120)
                )
                manager.add_error_event(
                    f"Error in trial {i}",
                    error_type="test_error"
                )
                errors_added += 1

        # Should have compacted (100 events >> threshold of 20)
        stats = manager.get_stats()
        assert stats["compaction_count"] > 0

        # All errors should be preserved (keep_errors=True)
        assert stats["error_events"] == errors_added

        # Total events should be bounded
        assert stats["total_events"] < 100  # Compaction reduced count

        # Context string should be reasonable length
        context = manager.get_context_string()
        assert len(context) < 50000  # Not too long

    def test_session_state_throughout_optimization(self, tmp_path):
        """
        Test session state tracking throughout an optimization.
        """
        session = AtomizerSessionState(session_id="integration_test")
        session.exposed.task_type = TaskType.RUN_OPTIMIZATION
        session.exposed.study_name = "state_test"

        # Simulate optimization progress
        for i in range(20):
            session.add_action(f"Processing trial {i}")

            # Periodic status updates (every 5th trial, skipping i=0)
            if i % 5 == 0 and i > 0:
                session.update_study_status(
                    name="state_test",
                    status="running",
                    trials_completed=i,
                    trials_total=20,
                    best_value=100 - i,
                    best_trial=i
                )

            # Sporadic non-fatal errors
            if i % 7 == 0:
                session.add_error(f"Minor issue at trial {i}")

        # Verify state
        assert session.exposed.trials_completed == 15  # Last update at i=15
        assert len(session.exposed.recent_errors) <= 5  # Bounded

        # Context should include key information
        context = session.get_llm_context()
        assert "state_test" in context
        assert "running" in context

    def test_cache_optimization_effectiveness(self):
        """
        Test that cache optimization actually works.
        """
        optimizer = ContextCacheOptimizer()

        # Build stable prefix (should be cached across requests)
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer, an optimization assistant")
        builder.add_capabilities("I can run FEA optimizations")
        builder.add_tools("Available tools: NX, Nastran, Optuna")
        stable_prefix = builder.build()

        # Simulate 10 requests with the same stable prefix but varying
        # semi-stable and dynamic portions
        for i in range(10):
            optimizer.prepare_context(
                stable_prefix=stable_prefix,
                semi_stable=f"Session info for request {i}",
                dynamic=f"User message {i}"
            )

        # Should have high cache hit rate (first request is the only miss)
        assert optimizer.stats.hit_rate >= 0.9  # 9/10 hits
        assert optimizer.stats.estimated_savings_percent >= 80  # Good savings
|
|
class TestReflectorLearningPatterns:
    """Test that the reflector extracts useful patterns."""

    def test_convergence_pattern_learning(self, tmp_path):
        """Test learning from convergence failures."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)

        # Feed the reflector a run of identical convergence failures.
        failures = [
            OptimizationOutcome(
                trial_number=n,
                success=False,
                objective_value=None,
                solver_errors=["Convergence failure at iteration 100"],
                design_variables={"x": n * 0.1},
                duration_seconds=300
            )
            for n in range(5)
        ]
        for outcome in failures:
            reflector.analyze_trial(outcome)

        reflector.commit_insights()

        # At least one committed insight should mention convergence.
        assert any(
            "convergence" in entry.content.lower()
            for entry in book.items.values()
        )

    def test_success_pattern_learning(self, tmp_path):
        """Test learning from successful designs."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)

        # Five successful trials sharing similar design characteristics.
        for n in range(5):
            reflector.analyze_trial(OptimizationOutcome(
                trial_number=n,
                success=True,
                objective_value=50 + n,
                design_variables={
                    "thickness": 1.0 + n * 0.1,  # All around 1.0-1.5
                    "width": 10.0  # Consistent
                },
                duration_seconds=60
            ))

        reflector.commit_insights()

        # Successful runs should yield at least one STRATEGY insight.
        assert any(
            entry.category == InsightCategory.STRATEGY
            for entry in book.items.values()
        )
|
|
|
|
|
|
class TestErrorTrackerIntegration:
    """Test error tracker plugin integration."""

    def test_error_classification(self):
        """Test error classification function."""
        from optimization_engine.plugins.post_solve.error_tracker import classify_error

        # Table of message -> expected classification.
        expected = {
            "Convergence failure at iteration 50": "convergence_failure",
            "Element distortion detected": "mesh_error",
            "Matrix singularity": "singularity",
            "Out of memory": "memory_error",
            "License checkout failed": "license_error",
            "Random unknown error": "unknown_error",
        }
        for message, category in expected.items():
            assert classify_error(message) == category

    def test_error_tracking_hook(self, tmp_path):
        """Test the error tracking hook function."""
        from optimization_engine.plugins.post_solve.error_tracker import track_error

        hook_context = {
            "trial_number": 5,
            "working_dir": str(tmp_path),
            "output_dir": str(tmp_path),
            "solver_returncode": 1,
            "error_message": "Convergence failure at iteration 100",
            "design_variables": {"x": 1.0, "y": 2.0},
        }

        outcome = track_error(hook_context)

        assert outcome["error_tracked"] is True
        assert outcome["error_type"] == "convergence_failure"

        # The hook should have created a JSONL error log in the output dir.
        log_path = tmp_path / "error_history.jsonl"
        assert log_path.exists()

        # Verify the first entry written to the log.
        with open(log_path) as fh:
            first_entry = json.loads(fh.readline())

        assert first_entry["trial"] == 5
        assert first_entry["error_type"] == "convergence_failure"
|
|
|
|
|
class TestPlaybookContextGeneration:
    """Test context generation for different scenarios."""

    def test_context_for_optimization_task(self):
        """Test context generation for optimization."""
        playbook = AtomizerPlaybook()

        # Add various insights, keeping the returned item so outcomes can
        # be recorded against its real ID.
        strategy = playbook.add_insight(InsightCategory.STRATEGY, "Start with coarse mesh")
        playbook.add_insight(InsightCategory.MISTAKE, "Avoid tiny elements")
        playbook.add_insight(InsightCategory.TOOL, "Use TPE for exploration")

        # Give the strategy insight a positive score.
        # BUG FIX: previously recorded against the hard-coded ID
        # "str-00001", which silently breaks if the playbook's ID scheme
        # changes; use the item actually returned by add_insight(), as
        # every other test in this module does.
        playbook.record_outcome(strategy.id, helpful=True)
        playbook.record_outcome(strategy.id, helpful=True)

        context = playbook.get_context_for_task("optimization", max_items=10)

        assert "Playbook" in context
        assert "STRATEGY" in context
        assert "coarse mesh" in context

    def test_context_filtering_by_confidence(self):
        """Test that low-confidence items are filtered."""
        playbook = AtomizerPlaybook()

        # Add item with low confidence (1 helpful, 3 harmful)
        item = playbook.add_insight(InsightCategory.STRATEGY, "Questionable advice")
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=False)
        playbook.record_outcome(item.id, helpful=False)
        playbook.record_outcome(item.id, helpful=False)
        # confidence = 1/4 = 0.25

        # High min_confidence should exclude it
        context = playbook.get_context_for_task(
            "optimization",
            min_confidence=0.5
        )

        assert "Questionable advice" not in context

    def test_context_ordering_by_score(self):
        """Test that items are ordered by net score."""
        playbook = AtomizerPlaybook()

        # Add items with different scores
        low = playbook.add_insight(InsightCategory.STRATEGY, "Low score advice")
        high = playbook.add_insight(InsightCategory.STRATEGY, "High score advice")

        # Give the "high" item a better net score (5 helpful vs 1)
        for _ in range(5):
            playbook.record_outcome(high.id, helpful=True)
        playbook.record_outcome(low.id, helpful=True)

        context = playbook.get_context_for_task("optimization")

        # Higher-scoring item should appear first in the rendered context
        high_pos = context.find("High score")
        low_pos = context.find("Low score")
        assert high_pos < low_pos
|
|
|
|
|
|
if __name__ == "__main__":
|
|
pytest.main([__file__, "-v"])
|