Full implementation - Vision AI, config, improved pipeline

Major changes:

- vision_analyzer.py: Real OpenAI/Anthropic vision API integration
  - Component detection with confidence scores
  - Atomizer hints extraction (objectives, constraints, parameters)
  - Material and feature identification
  - Timeline correlation with transcript
- config.py: Full configuration system
  - API settings (provider, keys, models)
  - Processing settings (Whisper model, frame interval, scene detection)
  - Output settings (BOM, hints, PDF template)
  - Config file support (~/.cad-documenter.toml)
- audio_analyzer.py: Enhanced transcription
  - Audio stream detection
  - Graceful fallback for missing audio
  - Keyword extraction
  - Technical term detection
  - Timeline correlation
- video_processor.py: Smart frame extraction
  - Scene change detection via ffmpeg
  - Configurable thresholds
  - Best frame selection
- doc_generator.py: Improved output
  - Better Markdown templates
  - BOM CSV export
  - Atomizer hints JSON
  - Component cards
- cli.py: Rich CLI with progress indicators
  - Config file support
  - --init-config flag
  - Verbose mode
  - Better error messages
- tests: Comprehensive test suite
2026-01-27 20:16:44 +00:00
parent 1e94a98e5b
commit 148180c12e
9 changed files with 2084 additions and 270 deletions
--- a/src/cad_documenter/config.py
+++ b/src/cad_documenter/config.py
@@ -0,0 +1,179 @@
+"""Configuration management for CAD-Documenter."""
+
+import os
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import Literal
+
+try:
+    import tomllib
+except ImportError:
+    import tomli as tomllib
+
+
+@dataclass
+class TranscriptionConfig:
+    """Transcription settings; not a member of ``Config`` and not read by ``load_config``."""
+    model: str = "base"  # model size name: tiny, base, small, medium, large
+    language: str | None = None  # None = auto-detect the spoken language
+
+
+@dataclass
+class APIConfig:
+    """Vision/text API settings, filled from the ``[api]`` TOML table and the environment."""
+    provider: Literal["openai", "anthropic"] = "openai"  # not validated by load_config
+    api_key: str | None = None  # None until a config-file or environment key is found
+    vision_model: str | None = None  # None = use provider default
+    text_model: str | None = None  # None = use provider default
+
+
+@dataclass
+class ProcessingConfig:
+    """Video/audio processing settings, filled from the ``[processing]`` TOML table."""
+    whisper_model: str = "base"  # Whisper model: tiny, base, small, medium, large
+    frame_interval: float = 2.0  # seconds between frames when scene detection is off
+    use_scene_detection: bool = True  # select frames at scene changes
+    max_frames: int = 15  # maximum frames sent to the vision API
+    scene_threshold: float = 0.3  # 0.0-1.0, lower = more sensitive
+
+
+@dataclass
+class OutputConfig:
+    """Switches for the generated output, filled from the ``[output]`` TOML table."""
+    include_bom: bool = True  # Bill of Materials in the documentation
+    include_atomizer_hints: bool = True  # Atomizer FEA hints
+    include_raw_transcript: bool = True  # raw transcript at the end of the documentation
+    include_frames: bool = True  # extracted frames in the output directory
+    pdf_template: str = "default"  # template name for the --pdf option
+
+
+@dataclass
+class Config:
+    """Top-level settings object returned by ``load_config``; one attribute per section."""
+    api: APIConfig = field(default_factory=APIConfig)  # factory: new instance per Config
+    processing: ProcessingConfig = field(default_factory=ProcessingConfig)
+    output: OutputConfig = field(default_factory=OutputConfig)
+
+
+def load_config(config_path: Path | None = None) -> Config:
+    """
+    Load configuration from file and environment variables.
+    
+    Priority (highest to lowest):
+    1. Environment variables
+    2. Config file
+    3. Defaults
+    """
+    config = Config()
+    
+    # Try to load config file
+    if config_path is None:
+        # Check common locations
+        locations = [
+            Path.cwd() / "cad-documenter.toml",
+            Path.cwd() / ".cad-documenter.toml",
+            Path.home() / ".cad-documenter.toml",
+            Path.home() / ".config" / "cad-documenter" / "config.toml",
+        ]
+        for loc in locations:
+            if loc.exists():
+                config_path = loc
+                break
+    
+    if config_path and config_path.exists():
+        with open(config_path, "rb") as f:
+            data = tomllib.load(f)
+        
+        # API config
+        if "api" in data:
+            api_data = data["api"]
+            config.api.provider = api_data.get("provider", config.api.provider)
+            config.api.api_key = api_data.get("api_key", config.api.api_key)
+            config.api.vision_model = api_data.get("vision_model", config.api.vision_model)
+            config.api.text_model = api_data.get("text_model", config.api.text_model)
+        
+        # Processing config
+        if "processing" in data:
+            proc_data = data["processing"]
+            config.processing.whisper_model = proc_data.get("whisper_model", config.processing.whisper_model)
+            config.processing.frame_interval = proc_data.get("frame_interval", config.processing.frame_interval)
+            config.processing.use_scene_detection = proc_data.get("use_scene_detection", config.processing.use_scene_detection)
+            config.processing.max_frames = proc_data.get("max_frames", config.processing.max_frames)
+            config.processing.scene_threshold = proc_data.get("scene_threshold", config.processing.scene_threshold)
+        
+        # Output config
+        if "output" in data:
+            out_data = data["output"]
+            config.output.include_bom = out_data.get("include_bom", config.output.include_bom)
+            config.output.include_atomizer_hints = out_data.get("include_atomizer_hints", config.output.include_atomizer_hints)
+            config.output.include_raw_transcript = out_data.get("include_raw_transcript", config.output.include_raw_transcript)
+            config.output.include_frames = out_data.get("include_frames", config.output.include_frames)
+            config.output.pdf_template = out_data.get("pdf_template", config.output.pdf_template)
+    
+    # Override with environment variables
+    if os.environ.get("CAD_DOC_PROVIDER"):
+        config.api.provider = os.environ["CAD_DOC_PROVIDER"]
+    
+    if os.environ.get("OPENAI_API_KEY"):
+        if config.api.provider == "openai" and not config.api.api_key:
+            config.api.api_key = os.environ["OPENAI_API_KEY"]
+    
+    if os.environ.get("ANTHROPIC_API_KEY"):
+        if config.api.provider == "anthropic" and not config.api.api_key:
+            config.api.api_key = os.environ["ANTHROPIC_API_KEY"]
+    
+    if os.environ.get("CAD_DOC_WHISPER_MODEL"):
+        config.processing.whisper_model = os.environ["CAD_DOC_WHISPER_MODEL"]
+    
+    return config
+
+
+def create_default_config(path: Path) -> None:
+    """Create a default config file."""
+    content = '''# CAD-Documenter Configuration
+
+[api]
+# Vision API provider: "openai" or "anthropic"
+provider = "openai"
+
+# API key (or set OPENAI_API_KEY / ANTHROPIC_API_KEY environment variable)
+# api_key = "sk-..."
+
+# Model overrides (optional - uses provider defaults if not set)
+# vision_model = "gpt-4o"
+# text_model = "gpt-4o-mini"
+
+[processing]
+# Whisper model for transcription: tiny, base, small, medium, large
+whisper_model = "base"
+
+# Seconds between frame extractions (if not using scene detection)
+frame_interval = 2.0
+
+# Use scene change detection for smarter frame selection
+use_scene_detection = true
+
+# Maximum frames to send to vision API
+max_frames = 15
+
+# Scene detection sensitivity (0.0-1.0, lower = more sensitive)
+scene_threshold = 0.3
+
+[output]
+# Include Bill of Materials in documentation
+include_bom = true
+
+# Include Atomizer FEA hints
+include_atomizer_hints = true
+
+# Include raw transcript at end of documentation
+include_raw_transcript = true
+
+# Include extracted frames in output directory
+include_frames = true
+
+# PDF template name (for --pdf option)
+pdf_template = "default"
+'''
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(content)