# Atomizer/tests/test_context_engineering.py
"""
Test suite for context engineering components.
Tests the ACE (Agentic Context Engineering) implementation:
- Playbook: Knowledge store with helpful/harmful tracking
- Reflector: Outcome analysis and insight extraction
- SessionState: Context isolation
- Compaction: Long-running session management
- FeedbackLoop: Automated learning
"""
import pytest
from pathlib import Path
import tempfile
import json
from datetime import datetime
from optimization_engine.context.playbook import (
AtomizerPlaybook,
PlaybookItem,
InsightCategory
)
from optimization_engine.context.reflector import (
AtomizerReflector,
OptimizationOutcome
)
from optimization_engine.context.session_state import (
AtomizerSessionState,
TaskType,
ExposedState,
IsolatedState
)
from optimization_engine.context.compaction import (
CompactionManager,
ContextEvent,
EventType,
ContextBudgetManager
)
from optimization_engine.context.cache_monitor import (
ContextCacheOptimizer,
CacheStats,
StablePrefixBuilder
)
from optimization_engine.context.feedback_loop import (
FeedbackLoop
)
class TestAtomizerPlaybook:
    """Tests for the playbook system (ACE knowledge store).

    Covers item creation and ID assignment, per-category numbering,
    content deduplication, helpful/harmful outcome tracking, confidence
    scoring, JSON persistence, pruning of harmful items, content search,
    and task-context rendering.
    """
    def test_create_empty_playbook(self):
        """A fresh playbook has no items and starts at version 1."""
        playbook = AtomizerPlaybook()
        assert len(playbook.items) == 0
        assert playbook.version == 1
    def test_add_insight(self):
        """Adding an insight assigns a category-prefixed ID, zeroed
        helpful/harmful counters, and records the source trial."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(
            category=InsightCategory.STRATEGY,
            content="Use shell elements for thin walls",
            source_trial=1
        )
        assert item.id == "str-00001"
        assert item.helpful_count == 0
        assert item.harmful_count == 0
        assert item.category == InsightCategory.STRATEGY
        assert len(playbook.items) == 1
        assert 1 in item.source_trials
    def test_add_multiple_categories(self):
        """IDs are numbered independently within each category."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.STRATEGY, "Strategy 1")
        playbook.add_insight(InsightCategory.MISTAKE, "Mistake 1")
        playbook.add_insight(InsightCategory.TOOL, "Tool tip 1")
        playbook.add_insight(InsightCategory.STRATEGY, "Strategy 2")
        assert len(playbook.items) == 4
        assert "str-00001" in playbook.items
        assert "str-00002" in playbook.items
        assert "mis-00001" in playbook.items
        assert "tool-00001" in playbook.items
    def test_deduplication(self):
        """Re-adding identical content merges into the existing item and
        counts the duplicate as implicit helpful feedback."""
        playbook = AtomizerPlaybook()
        item1 = playbook.add_insight(InsightCategory.STRATEGY, "Use shell elements")
        item2 = playbook.add_insight(InsightCategory.STRATEGY, "Use shell elements")
        # Should merge into one item
        assert len(playbook.items) == 1
        # Helpful count incremented on duplicate
        assert item2.helpful_count == 1
        assert item1 is item2  # Same object
    def test_outcome_tracking(self):
        """Counters, net score, and confidence track recorded outcomes."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(InsightCategory.STRATEGY, "Test insight")
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=True)
        playbook.record_outcome(item.id, helpful=False)
        assert item.helpful_count == 2
        assert item.harmful_count == 1
        assert item.net_score == 1
        # Compare the derived float ratio with approx rather than exact
        # equality: the implementation may compute 2/3 in any equivalent
        # order, and exact float comparison is brittle.
        assert item.confidence == pytest.approx(2 / 3)
    def test_confidence_calculation(self):
        """Confidence starts neutral (0.5) and follows the helpful ratio."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(InsightCategory.STRATEGY, "Test")
        # Initial confidence is 0.5 (neutral)
        assert item.confidence == pytest.approx(0.5)
        # After positive feedback
        playbook.record_outcome(item.id, helpful=True)
        assert item.confidence == pytest.approx(1.0)
        # After mixed feedback
        playbook.record_outcome(item.id, helpful=False)
        assert item.confidence == pytest.approx(0.5)
    def test_persistence(self, tmp_path):
        """Items, outcome counts, and tags survive a save/load cycle."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.MISTAKE, "Don't do this", tags=["test"])
        playbook.add_insight(InsightCategory.STRATEGY, "Do this instead")
        # Record some outcomes
        playbook.record_outcome("mis-00001", helpful=False)
        playbook.record_outcome("str-00001", helpful=True)
        save_path = tmp_path / "playbook.json"
        playbook.save(save_path)
        # Load and verify
        loaded = AtomizerPlaybook.load(save_path)
        assert len(loaded.items) == 2
        assert "mis-00001" in loaded.items
        assert loaded.items["mis-00001"].harmful_count == 1
        assert loaded.items["str-00001"].helpful_count == 1
        assert "test" in loaded.items["mis-00001"].tags
    def test_pruning(self):
        """Items whose net score falls below the threshold are removed."""
        playbook = AtomizerPlaybook()
        item = playbook.add_insight(InsightCategory.STRATEGY, "Bad advice")
        # Record many harmful outcomes
        for _ in range(5):
            playbook.record_outcome(item.id, helpful=False)
        assert item.net_score == -5
        # Prune with threshold -3
        removed = playbook.prune_harmful(threshold=-3)
        assert removed == 1
        assert len(playbook.items) == 0
    def test_search_by_content(self):
        """Content search returns items matching the query text."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.STRATEGY, "Use shell elements for thin walls")
        playbook.add_insight(InsightCategory.STRATEGY, "Solid elements for thick parts")
        playbook.add_insight(InsightCategory.MISTAKE, "Don't use coarse mesh")
        results = playbook.search_by_content("shell elements")
        assert len(results) >= 1
        assert "shell" in results[0].content.lower()
    def test_get_context_for_task(self):
        """The rendered context lists items with IDs and helpful counts."""
        playbook = AtomizerPlaybook()
        playbook.add_insight(InsightCategory.STRATEGY, "Strategy 1")
        playbook.add_insight(InsightCategory.MISTAKE, "Mistake 1")
        # Make strategy have higher score
        playbook.record_outcome("str-00001", helpful=True)
        playbook.record_outcome("str-00001", helpful=True)
        context = playbook.get_context_for_task("optimization")
        assert "Playbook" in context
        assert "str-00001" in context
        assert "helpful=2" in context
class TestAtomizerReflector:
    """Exercises outcome analysis: success/failure insight extraction,
    mesh-error tagging, committing to the playbook, and study review."""
    @staticmethod
    def _make_outcome(**overrides):
        """Build an OptimizationOutcome, overriding only the fields a test needs."""
        fields = dict(
            trial_number=1,
            success=True,
            objective_value=100.0,
            constraint_violations=[],
            solver_errors=[],
            design_variables={},
            extractor_used="",
            duration_seconds=60,
        )
        fields.update(overrides)
        return OptimizationOutcome(**fields)
    def test_create_reflector(self):
        """A new reflector wraps the given playbook with nothing pending."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)
        assert reflector.playbook is book
        assert len(reflector.pending_insights) == 0
    def test_analyze_successful_trial(self):
        """A successful trial yields a helpful insight and is recorded."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        outcome = self._make_outcome(
            design_variables={"thickness": 1.0, "width": 5.0},
            extractor_used="mass_extractor",
        )
        insights = reflector.analyze_trial(outcome)
        assert len(insights) >= 1
        assert any(i.helpful for i in insights)
        assert 1 in reflector.analyzed_trials
    def test_analyze_failed_trial(self):
        """A failed trial produces mistake insights for errors/violations."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        outcome = self._make_outcome(
            success=False,
            objective_value=None,
            constraint_violations=["stress > 250 MPa"],
            solver_errors=["convergence failure at iteration 50"],
            design_variables={"thickness": 0.5},
            extractor_used="stress_extractor",
            duration_seconds=120,
        )
        insights = reflector.analyze_trial(outcome)
        assert len(insights) >= 2  # At least error + constraint
        mistakes = [i for i in insights if i.category == InsightCategory.MISTAKE]
        assert mistakes
        assert not any(i.helpful for i in mistakes)
    def test_analyze_mesh_error(self):
        """A jacobian/distortion solver error gets tagged as mesh-related."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        outcome = self._make_outcome(
            trial_number=5,
            success=False,
            objective_value=None,
            solver_errors=["Element distortion: negative jacobian detected"],
            duration_seconds=30,
        )
        insights = reflector.analyze_trial(outcome)
        assert any("mesh" in str(i.tags).lower() for i in insights)
    def test_commit_insights(self):
        """Committing moves pending insights into the playbook, then clears."""
        book = AtomizerPlaybook()
        reflector = AtomizerReflector(book)
        reflector.analyze_trial(self._make_outcome(
            design_variables={"thickness": 1.0},
            extractor_used="mass_extractor",
        ))
        committed = reflector.commit_insights()
        assert committed > 0
        assert len(book.items) > 0
        assert len(reflector.pending_insights) == 0  # Cleared after commit
    def test_analyze_study_completion(self):
        """A high-convergence study yields a 'robust' method insight."""
        reflector = AtomizerReflector(AtomizerPlaybook())
        insights = reflector.analyze_study_completion(
            study_name="test_study",
            total_trials=100,
            best_value=50.0,
            convergence_rate=0.95,
            method="TPE",
        )
        assert len(insights) >= 1
        assert any("robust" in i.content.lower() for i in insights)
class TestSessionState:
    """Exercises AtomizerSessionState: exposed vs. isolated context,
    action/error logs with compression, and save/load round-trips."""
    def test_create_session(self):
        """A new session starts with no task type and an empty action log."""
        state = AtomizerSessionState(session_id="test_session")
        assert state.session_id == "test_session"
        assert state.exposed.task_type is None
        assert len(state.exposed.recent_actions) == 0
    def test_set_task_type(self):
        """The exposed task type is a plain settable attribute."""
        state = AtomizerSessionState(session_id="test")
        state.exposed.task_type = TaskType.CREATE_STUDY
        assert state.exposed.task_type == TaskType.CREATE_STUDY
    def test_add_action(self):
        """Actions accumulate in order on the exposed state."""
        state = AtomizerSessionState(session_id="test")
        state.add_action("Created study directory")
        state.add_action("Configured optimization")
        assert len(state.exposed.recent_actions) == 2
        assert "Created study" in state.exposed.recent_actions[0]
    def test_action_compression(self):
        """Once the log exceeds its limit, older entries are summarized."""
        state = AtomizerSessionState(session_id="test")
        for n in range(15):
            state.add_action(f"Action {n}")
        actions = state.exposed.recent_actions
        assert len(actions) <= 12
        assert any("earlier actions" in entry.lower() for entry in actions)
    def test_add_error(self):
        """Errors are recorded; the type tag is rendered in brackets."""
        state = AtomizerSessionState(session_id="test")
        state.add_error("Solver failed", error_type="convergence")
        state.add_error("Mesh error")
        assert len(state.exposed.recent_errors) == 2
        assert "[convergence]" in state.exposed.recent_errors[0]
    def test_update_study_status(self):
        """update_study_status mirrors progress fields onto exposed state."""
        state = AtomizerSessionState(session_id="test")
        state.update_study_status(
            name="bracket_opt",
            status="running",
            trials_completed=25,
            trials_total=100,
            best_value=0.5,
            best_trial=20,
        )
        assert state.exposed.study_name == "bracket_opt"
        assert state.exposed.trials_completed == 25
        assert state.exposed.best_value == 0.5
    def test_llm_context_generation(self):
        """The context string surfaces study name, progress, and task type."""
        state = AtomizerSessionState(session_id="test")
        state.exposed.task_type = TaskType.RUN_OPTIMIZATION
        state.exposed.study_name = "test_study"
        state.exposed.trials_completed = 50
        state.exposed.trials_total = 100
        state.exposed.best_value = 0.5
        rendered = state.get_llm_context()
        for expected in ("test_study", "50", "0.5", "run_optimization"):
            assert expected in rendered
    def test_isolated_state_access(self):
        """Isolated data is hidden from the LLM but explicitly loadable."""
        state = AtomizerSessionState(session_id="test")
        state.isolated.nx_model_path = "/path/to/model.prt"
        # Must not leak into the exposed context string
        assert "/path/to/model.prt" not in state.get_llm_context()
        # But remains accessible on explicit request
        assert state.load_isolated_data("nx_model_path") == "/path/to/model.prt"
    def test_persistence(self, tmp_path):
        """A saved session round-trips id, task type, and study name."""
        state = AtomizerSessionState(session_id="test_persist")
        state.exposed.task_type = TaskType.ANALYZE_RESULTS
        state.exposed.study_name = "persist_study"
        state.add_action("Test action")
        target = tmp_path / "session.json"
        state.save(target)
        restored = AtomizerSessionState.load(target)
        assert restored.session_id == "test_persist"
        assert restored.exposed.task_type == TaskType.ANALYZE_RESULTS
        assert restored.exposed.study_name == "persist_study"
class TestCompactionManager:
    """Exercises event accumulation, threshold-triggered compaction, and
    preservation of errors and milestones through compaction."""
    def test_create_manager(self):
        """Constructor records the configured thresholds and starts empty."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        assert mgr.compaction_threshold == 10
        assert mgr.keep_recent == 5
        assert len(mgr.events) == 0
    def test_add_events(self):
        """Trial events accumulate while below the compaction threshold."""
        mgr = CompactionManager(compaction_threshold=50)
        mgr.add_trial_event(trial_number=1, success=True, objective=100.0)
        mgr.add_trial_event(trial_number=2, success=False)
        assert len(mgr.events) == 2
    def test_compaction_trigger(self):
        """Exceeding the threshold compacts the log back under it."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        for n in range(15):
            event = ContextEvent(
                timestamp=datetime.now(),
                event_type=EventType.TRIAL_COMPLETE,
                summary=f"Trial {n} complete",
                details={"trial_number": n, "objective": n * 0.1},
            )
            mgr.add_event(event)
        assert mgr.compaction_count > 0
        assert len(mgr.events) <= 10
    def test_error_preservation(self):
        """Error events survive compaction regardless of their age."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=3)
        # Record the error first so compaction would otherwise discard it
        mgr.add_error_event("Critical solver failure", "solver_error")
        for n in range(20):
            mgr.add_trial_event(trial_number=n, success=True, objective=n)
        kept_errors = [e for e in mgr.events if e.event_type == EventType.ERROR]
        assert len(kept_errors) == 1
        assert "Critical solver failure" in kept_errors[0].summary
    def test_milestone_preservation(self):
        """Milestone events survive compaction regardless of their age."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=3)
        mgr.add_milestone("Optimization started", {"method": "TPE"})
        for n in range(20):
            mgr.add_trial_event(trial_number=n, success=True)
        kept = [e for e in mgr.events if e.event_type == EventType.MILESTONE]
        assert len(kept) == 1
    def test_context_string_generation(self):
        """The rendered history mentions the header, trials, and errors."""
        mgr = CompactionManager()
        mgr.add_trial_event(trial_number=1, success=True, objective=100.0)
        mgr.add_error_event("Test error")
        rendered = mgr.get_context_string()
        assert "Optimization History" in rendered
        assert "Trial 1" in rendered
        assert "Test error" in rendered
    def test_get_stats(self):
        """Stats report event totals and how many compactions occurred."""
        mgr = CompactionManager(compaction_threshold=10, keep_recent=5)
        for n in range(15):
            mgr.add_trial_event(trial_number=n, success=n % 2 == 0)
        stats = mgr.get_stats()
        assert stats["total_events"] <= 15
        assert stats["compaction_count"] > 0
class TestCacheMonitor:
    """Exercises ContextCacheOptimizer hit/miss accounting and the
    StablePrefixBuilder section-ordering contract."""
    def test_create_optimizer(self):
        """A new optimizer starts with zeroed request/hit counters."""
        opt = ContextCacheOptimizer()
        assert opt.stats.total_requests == 0
        assert opt.stats.cache_hits == 0
    def test_prepare_context(self):
        """prepare_context includes all three tiers and counts the request."""
        opt = ContextCacheOptimizer()
        assembled = opt.prepare_context(
            stable_prefix="Stable content",
            semi_stable="Session content",
            dynamic="User message",
        )
        for fragment in ("Stable content", "Session content", "User message"):
            assert fragment in assembled
        assert opt.stats.total_requests == 1
    def test_cache_hit_detection(self):
        """A repeated stable prefix counts as a cache hit."""
        opt = ContextCacheOptimizer()
        opt.prepare_context("Stable", "Semi", "Dynamic 1")  # first request
        opt.prepare_context("Stable", "Semi", "Dynamic 2")  # same stable prefix
        assert opt.stats.total_requests == 2
        assert opt.stats.cache_hits == 1
    def test_cache_miss_detection(self):
        """Changing the stable prefix yields misses, never hits."""
        opt = ContextCacheOptimizer()
        opt.prepare_context("Stable 1", "Semi", "Dynamic")
        opt.prepare_context("Stable 2", "Semi", "Dynamic")  # different prefix
        assert opt.stats.cache_hits == 0
        assert opt.stats.cache_misses == 2
    def test_stable_prefix_builder(self):
        """Sections appear in the built prefix in their priority order."""
        builder = StablePrefixBuilder()
        builder.add_identity("I am Atomizer")
        builder.add_capabilities("I can optimize")
        builder.add_tools("Tool definitions here")
        prefix = builder.build()
        assert "I am Atomizer" in prefix
        assert "I can optimize" in prefix
        # Identity (order 10) must precede capabilities (order 20)
        assert prefix.index("Atomizer") < prefix.index("optimize")
class TestFeedbackLoop:
    """Exercises the automated learning loop: per-trial processing,
    study finalization, and outcome attribution to playbook items."""
    def test_create_feedback_loop(self, tmp_path):
        """A new loop owns a playbook and has processed zero trials."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        assert loop.playbook is not None
        assert loop._total_trials_processed == 0
    def test_process_successful_trial(self, tmp_path):
        """A successful trial is echoed back and counted as a success."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        result = loop.process_trial_result(
            trial_number=1,
            success=True,
            objective_value=100.0,
            design_variables={"thickness": 1.0},
        )
        assert result["trial_number"] == 1
        assert result["success"] is True
        assert loop._total_trials_processed == 1
        assert loop._successful_trials == 1
    def test_process_failed_trial(self, tmp_path):
        """A failed trial with errors is counted as a failure."""
        loop = FeedbackLoop(tmp_path / "playbook.json")
        result = loop.process_trial_result(
            trial_number=1,
            success=False,
            objective_value=0.0,
            design_variables={"thickness": 0.5},
            errors=["Convergence failure"],
        )
        assert result["success"] is False
        assert loop._failed_trials == 1
    def test_finalize_study(self, tmp_path):
        """Finalizing persists the playbook and reports insight counts."""
        playbook_file = tmp_path / "playbook.json"
        loop = FeedbackLoop(playbook_file)
        # Feed a mix: two of every three trials succeed
        for n in range(10):
            succeeded = n % 3 != 0
            loop.process_trial_result(
                trial_number=n,
                success=succeeded,
                objective_value=100 - n if succeeded else 0,
                design_variables={"x": n * 0.1},
            )
        summary = loop.finalize_study({
            "name": "test_study",
            "total_trials": 10,
            "best_value": 91,
            "convergence_rate": 0.7,
        })
        assert summary["insights_added"] > 0
        assert summary["playbook_size"] > 0
        assert playbook_file.exists()  # Should be saved
    def test_playbook_item_attribution(self, tmp_path):
        """Items active during successful trials accrue helpful feedback."""
        playbook_file = tmp_path / "playbook.json"
        # Seed the on-disk playbook with one strategy item
        seed = AtomizerPlaybook()
        item = seed.add_insight(InsightCategory.STRATEGY, "Test strategy")
        seed.save(playbook_file)
        # Run two successful trials that cite the seeded item
        loop = FeedbackLoop(playbook_file)
        for trial, objective in ((1, 100.0), (2, 95.0)):
            loop.process_trial_result(
                trial_number=trial,
                success=True,
                objective_value=objective,
                design_variables={},
                context_items_used=[item.id],
            )
        assert loop.playbook.items[item.id].helpful_count == 2
class TestContextBudgetManager:
    """Exercises token estimation, per-section usage tracking, and
    over-budget reporting."""
    def test_create_manager(self):
        """Default budget totals 100k tokens with a stable_prefix slot."""
        budget_mgr = ContextBudgetManager()
        assert budget_mgr.budget["total"] == 100000
        assert "stable_prefix" in budget_mgr.budget
    def test_estimate_tokens(self):
        """Estimation is chars // 4 — "Hello world" (11 chars) -> 2 tokens."""
        budget_mgr = ContextBudgetManager()
        assert budget_mgr.estimate_tokens("Hello world") == 2
    def test_update_usage(self):
        """Within-budget usage reports section, tokens, over_budget=False."""
        budget_mgr = ContextBudgetManager()
        report = budget_mgr.update_usage("stable_prefix", "x" * 20000)  # 5000 tokens
        assert report["section"] == "stable_prefix"
        assert report["tokens"] == 5000
        assert report["over_budget"] is False
    def test_over_budget_warning(self):
        """Exceeding a section budget flags it and attaches a warning."""
        budget_mgr = ContextBudgetManager()
        # stable_prefix budget is 5000 tokens (20000 chars); write double that
        report = budget_mgr.update_usage("stable_prefix", "x" * 40000)
        assert report["over_budget"] is True
        assert "warning" in report
    def test_get_status(self):
        """Overall status exposes totals, utilization, and recommendations."""
        budget_mgr = ContextBudgetManager()
        budget_mgr.update_usage("stable_prefix", "x" * 10000)
        budget_mgr.update_usage("protocols", "x" * 20000)
        status = budget_mgr.get_status()
        for key in ("total_used", "utilization", "recommendations"):
            assert key in status
if __name__ == "__main__":
    # pytest.main returns an exit status; propagate it so running this file
    # directly exits non-zero on test failure (discarding it always exits 0).
    raise SystemExit(pytest.main([__file__, "-v"]))