Full implementation - Vision AI, config, improved pipeline

Major changes:
- vision_analyzer.py: Real OpenAI/Anthropic vision API integration
  - Component detection with confidence scores
  - Atomizer hints extraction (objectives, constraints, parameters)
  - Material and feature identification
  - Timeline correlation with transcript

- config.py: Full configuration system
  - API settings (provider, keys, models)
  - Processing settings (Whisper model, frame interval, scene detection)
  - Output settings (BOM, hints, PDF template)
  - Config file support (~/.cad-documenter.toml)

- audio_analyzer.py: Enhanced transcription
  - Audio stream detection
  - Graceful fallback for missing audio
  - Keyword extraction
  - Technical term detection
  - Timeline correlation

- video_processor.py: Smart frame extraction
  - Scene change detection via ffmpeg
  - Configurable thresholds
  - Best frame selection

- doc_generator.py: Improved output
  - Better Markdown templates
  - BOM CSV export
  - Atomizer hints JSON
  - Component cards

- cli.py: Rich CLI with progress indicators
  - Config file support
  - --init-config flag
  - Verbose mode
  - Better error messages

- tests: Comprehensive test suite
This commit is contained in:
Mario Lavoie
2026-01-27 20:16:44 +00:00
parent 1e94a98e5b
commit 148180c12e
9 changed files with 2084 additions and 270 deletions

View File

@@ -0,0 +1,179 @@
"""Configuration management for CAD-Documenter."""
import os
from pathlib import Path
from dataclasses import dataclass, field
from typing import Literal
try:
import tomllib
except ImportError:
import tomli as tomllib
@dataclass
class TranscriptionConfig:
"""Transcription configuration."""
model: str = "base" # tiny, base, small, medium, large
language: str | None = None # None = auto-detect
@dataclass
class APIConfig:
"""API configuration."""
provider: Literal["openai", "anthropic"] = "openai"
api_key: str | None = None
vision_model: str | None = None # None = use provider default
text_model: str | None = None
@dataclass
class ProcessingConfig:
    """Settings for the video and audio processing stage.

    Attributes:
        whisper_model: Whisper model used for transcription.
        frame_interval: Seconds between frame extractions (applies when
            scene detection is not used).
        use_scene_detection: Whether to pick frames via scene change
            detection.
        max_frames: Maximum number of frames to send to the vision API.
        scene_threshold: Scene detection sensitivity in the range
            0.0-1.0; lower values are more sensitive.
    """

    whisper_model: str = "base"
    frame_interval: float = 2.0
    use_scene_detection: bool = True
    max_frames: int = 15
    scene_threshold: float = 0.3
@dataclass
class OutputConfig:
    """Switches that decide what the generated documentation contains.

    Attributes:
        include_bom: Include the Bill of Materials.
        include_atomizer_hints: Include the Atomizer FEA hints.
        include_raw_transcript: Include the raw transcript at the end of
            the documentation.
        include_frames: Include the extracted frames in the output
            directory.
        pdf_template: Name of the PDF template to use.
    """

    include_bom: bool = True
    include_atomizer_hints: bool = True
    include_raw_transcript: bool = True
    include_frames: bool = True
    pdf_template: str = "default"
@dataclass
class Config:
    """Top-level settings object grouping the three configuration sections.

    Every section is created through ``default_factory``, so each
    ``Config`` instance gets its own section objects rather than sharing
    one mutable default.

    Attributes:
        api: API provider, key and model settings.
        processing: Video/audio processing settings.
        output: Output content settings.
    """

    api: APIConfig = field(default_factory=APIConfig)
    processing: ProcessingConfig = field(default_factory=ProcessingConfig)
    output: OutputConfig = field(default_factory=OutputConfig)
# Keys read from each TOML table. Every key name is also the name of the
# attribute it sets on the matching section of ``Config``.
_SECTION_KEYS = {
    "api": ("provider", "api_key", "vision_model", "text_model"),
    "processing": (
        "whisper_model",
        "frame_interval",
        "use_scene_detection",
        "max_frames",
        "scene_threshold",
    ),
    "output": (
        "include_bom",
        "include_atomizer_hints",
        "include_raw_transcript",
        "include_frames",
        "pdf_template",
    ),
}

# (provider name, environment variable holding that provider's API key).
_PROVIDER_KEY_ENV_VARS = (
    ("openai", "OPENAI_API_KEY"),
    ("anthropic", "ANTHROPIC_API_KEY"),
)


def _find_config_file() -> Path | None:
    """Return the first default config location that is a file, else None."""
    candidates = (
        Path.cwd() / "cad-documenter.toml",
        Path.cwd() / ".cad-documenter.toml",
        Path.home() / ".cad-documenter.toml",
        Path.home() / ".config" / "cad-documenter" / "config.toml",
    )
    # is_file() rather than exists(): a directory that happens to carry one
    # of these names must not be picked and then fail inside open().
    return next((path for path in candidates if path.is_file()), None)


def _apply_file_settings(config: Config, data: dict) -> None:
    """Copy the recognised keys of parsed TOML *data* onto *config*."""
    for section_name, keys in _SECTION_KEYS.items():
        if section_name not in data:
            continue
        values = data[section_name]
        if not isinstance(values, dict):
            # e.g. `api = "openai"` instead of an `[api]` table.
            raise ValueError(
                f"[{section_name}] in the config file must be a table"
            )
        section = getattr(config, section_name)
        for key in keys:
            # Only keys present in the file replace the current value.
            if key in values:
                setattr(section, key, values[key])


def _apply_env_overrides(config: Config) -> None:
    """Apply environment variable settings on top of *config*."""
    env_provider = os.environ.get("CAD_DOC_PROVIDER")
    # True when the environment selects a different provider than the
    # file/defaults did; a key loaded from the file then belongs to the
    # previously configured provider.
    provider_switched = bool(env_provider) and env_provider != config.api.provider
    if env_provider:
        config.api.provider = env_provider

    for provider, env_var in _PROVIDER_KEY_ENV_VARS:
        env_key = os.environ.get(env_var)
        if not env_key or config.api.provider != provider:
            continue
        # The provider's env key is a fallback for a missing key, and it
        # also replaces a key that was configured for another provider.
        if provider_switched or not config.api.api_key:
            config.api.api_key = env_key

    whisper_model = os.environ.get("CAD_DOC_WHISPER_MODEL")
    if whisper_model:
        config.processing.whisper_model = whisper_model


def load_config(config_path: Path | None = None) -> Config:
    """
    Load configuration from file and environment variables.

    Priority (highest to lowest):
    1. Environment variables CAD_DOC_PROVIDER and CAD_DOC_WHISPER_MODEL
    2. Config file
    3. Defaults

    The API key is resolved differently: an ``api_key`` from the config
    file is kept, and OPENAI_API_KEY / ANTHROPIC_API_KEY (whichever matches
    the active provider) is used only when the file sets no key, or when
    CAD_DOC_PROVIDER switched to a different provider than the one the
    file key was configured for.

    Args:
        config_path: Config file to read. When ``None``, these locations
            are tried in order and the first existing file is used:
            ``./cad-documenter.toml``, ``./.cad-documenter.toml``,
            ``~/.cad-documenter.toml``,
            ``~/.config/cad-documenter/config.toml``.
            An explicit path that does not exist is ignored and the
            defaults are used.

    Returns:
        The populated ``Config``.

    Raises:
        ValueError: If ``api``, ``processing`` or ``output`` is present in
            the file but is not a table.
    """
    config = Config()

    if config_path is None:
        config_path = _find_config_file()

    if config_path and config_path.exists():
        # tomllib requires a binary file handle.
        with open(config_path, "rb") as f:
            data = tomllib.load(f)
        _apply_file_settings(config, data)

    _apply_env_overrides(config)
    return config
# Template written by create_default_config().
_DEFAULT_CONFIG_TOML = '''# CAD-Documenter Configuration
[api]
# Vision API provider: "openai" or "anthropic"
provider = "openai"
# API key (or set OPENAI_API_KEY / ANTHROPIC_API_KEY environment variable)
# api_key = "sk-..."
# Model overrides (optional - uses provider defaults if not set)
# vision_model = "gpt-4o"
# text_model = "gpt-4o-mini"
[processing]
# Whisper model for transcription: tiny, base, small, medium, large
whisper_model = "base"
# Seconds between frame extractions (if not using scene detection)
frame_interval = 2.0
# Use scene change detection for smarter frame selection
use_scene_detection = true
# Maximum frames to send to vision API
max_frames = 15
# Scene detection sensitivity (0.0-1.0, lower = more sensitive)
scene_threshold = 0.3
[output]
# Include Bill of Materials in documentation
include_bom = true
# Include Atomizer FEA hints
include_atomizer_hints = true
# Include raw transcript at end of documentation
include_raw_transcript = true
# Include extracted frames in output directory
include_frames = true
# PDF template name (for --pdf option)
pdf_template = "default"
'''


def create_default_config(path: Path) -> None:
    """Create a default config file.

    Missing parent directories are created. An existing file at *path* is
    overwritten.

    Args:
        path: Destination of the config file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # TOML documents must be UTF-8, so do not depend on the locale's
    # default text encoding.
    path.write_text(_DEFAULT_CONFIG_TOML, encoding="utf-8")