Initial project scaffold - Phase 1 MVP structure

Core modules:
- cli.py: Command-line interface with Click
- pipeline.py: Main orchestrator
- video_processor.py: Frame extraction with ffmpeg
- audio_analyzer.py: Whisper transcription
- vision_analyzer.py: Component detection (placeholder)
- doc_generator.py: Markdown + PDF output

Also includes:
- pyproject.toml with uv/hatch config
- Prompts for AI analysis
- Basic tests
- ROADMAP.md with 4-week plan
This commit is contained in:
Mario Lavoie
2026-01-27 20:05:34 +00:00
parent 621234cbdf
commit 1e94a98e5b
16 changed files with 1062 additions and 1 deletion

View File

@@ -0,0 +1,3 @@
"""CAD-Documenter: Video walkthrough → Complete engineering documentation."""
__version__ = "0.1.0"

View File

@@ -0,0 +1,104 @@
"""Audio analysis module - transcription via Whisper."""
from pathlib import Path
from dataclasses import dataclass
import subprocess
import tempfile
@dataclass
class TranscriptSegment:
    """A segment of transcribed audio."""

    start: float  # segment start, seconds from the beginning of the audio
    end: float    # segment end, seconds
    text: str     # transcribed text for this segment
@dataclass
class Transcript:
    """Full transcript with segments."""

    segments: list[TranscriptSegment]
    full_text: str

    def get_text_at(self, timestamp: float, window: float = 5.0) -> str:
        """Return the text of every segment overlapping [timestamp - window,
        timestamp + window], joined with single spaces (may be empty)."""
        lo, hi = timestamp - window, timestamp + window
        return " ".join(
            seg.text for seg in self.segments if seg.start <= hi and seg.end >= lo
        )
class AudioAnalyzer:
    """Handles audio transcription using Whisper."""

    def __init__(self, video_path: Path, model: str = "base"):
        self.video_path = video_path  # source video whose audio track is transcribed
        self.model = model            # Whisper model size (tiny/base/small/medium/large)

    def transcribe(self) -> Transcript:
        """
        Transcribe audio from video using Whisper.

        Extracts a 16 kHz mono WAV to a temp file with ffmpeg, runs Whisper
        on it, and always removes the temp file afterwards.

        Returns:
            Transcript object with segments and full text

        Raises:
            RuntimeError: if ffmpeg fails to extract the audio track.
        """
        # Reserve a temp file name; the handle is closed immediately so
        # ffmpeg can write to it (needed on platforms with exclusive locks).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_path = Path(f.name)
        try:
            # 16 kHz mono PCM is the input format Whisper expects.
            cmd = [
                "ffmpeg", "-y",
                "-i", str(self.video_path),
                "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1",
                str(audio_path),
            ]
            proc = subprocess.run(cmd, capture_output=True)
            # Fail loudly on a bad/missing video instead of handing Whisper
            # an empty WAV (the original ignored the exit status).
            if proc.returncode != 0:
                raise RuntimeError(
                    "ffmpeg audio extraction failed: "
                    + proc.stderr.decode(errors="replace")
                )
            import whisper  # local import: heavy optional dependency

            model = whisper.load_model(self.model)
            result = model.transcribe(str(audio_path), word_timestamps=True)
            segments = [
                TranscriptSegment(
                    start=seg["start"],
                    end=seg["end"],
                    text=seg["text"].strip(),
                )
                for seg in result.get("segments", [])
            ]
            return Transcript(
                segments=segments,
                full_text=result.get("text", "").strip(),
            )
        finally:
            # Always clean up the temp WAV, even when ffmpeg/Whisper fail
            # (the original leaked it if ffmpeg raised before the try block).
            audio_path.unlink(missing_ok=True)

    def extract_keywords(self, transcript: Transcript) -> list[str]:
        """Extract likely component names and technical terms.

        Heuristic: for each introductory phrase ("this is the", ...), take
        the first few words after its first occurrence.  Can be enhanced
        with real NLP later.  Returned order is unspecified (set-based dedupe).
        """
        keywords = []
        indicator_phrases = [
            "this is the", "this is a", "here we have",
            "the main", "called the", "known as",
            "this part", "this component", "this assembly"
        ]
        text_lower = transcript.full_text.lower()
        for phrase in indicator_phrases:
            if phrase in text_lower:
                # Only the first occurrence of each phrase is inspected;
                # grab a 50-char window after it and keep up to 3 words.
                idx = text_lower.find(phrase)
                after = transcript.full_text[idx + len(phrase):idx + len(phrase) + 50]
                words = after.strip().split()[:3]
                if words:
                    keywords.append(" ".join(words).strip(",.;:"))
        return list(set(keywords))

86
src/cad_documenter/cli.py Normal file
View File

@@ -0,0 +1,86 @@
"""CAD-Documenter CLI - Main entry point."""
import click
from pathlib import Path
from rich.console import Console
from .pipeline import DocumentationPipeline
# Single shared Rich console for all CLI output.
console = Console()
@click.command()
@click.argument("video", type=click.Path(exists=True, path_type=Path))
@click.option("-o", "--output", type=click.Path(path_type=Path), help="Output directory")
@click.option("--frames-only", is_flag=True, help="Only extract frames, skip documentation")
@click.option("--atomizer-hints", is_flag=True, help="Generate Atomizer FEA hints")
@click.option("--bom", is_flag=True, help="Generate Bill of Materials")
@click.option("--pdf", is_flag=True, help="Generate PDF via Atomaste Report Standard")
@click.option("--frame-interval", default=2.0, help="Seconds between frame extractions")
@click.option("--whisper-model", default="base", help="Whisper model size (tiny/base/small/medium/large)")
@click.version_option()
def main(
    video: Path,
    output: Path | None,
    frames_only: bool,
    atomizer_hints: bool,
    bom: bool,
    pdf: bool,
    frame_interval: float,
    whisper_model: str,
):
    """
    Generate engineering documentation from a CAD walkthrough video.
    VIDEO: Path to the video file (.mp4, .mov, .avi, etc.)
    """
    # NOTE(review): banner version is hard-coded — consider printing the
    # package __version__ so this stays in sync with releases.
    # (Fixed: this line had an f-string prefix with no placeholders.)
    console.print("[bold blue]CAD-Documenter[/bold blue] v0.1.0")
    console.print(f"Processing: [cyan]{video}[/cyan]")
    # Default output directory sits next to the input video.
    if output is None:
        output = video.parent / f"{video.stem}_docs"
    output.mkdir(parents=True, exist_ok=True)
    # Build the pipeline once; individual stages are invoked below.
    pipeline = DocumentationPipeline(
        video_path=video,
        output_dir=output,
        frame_interval=frame_interval,
        whisper_model=whisper_model,
    )
    if frames_only:
        # Early exit: frame extraction only, no transcription/analysis/docs.
        console.print("[yellow]Extracting frames only...[/yellow]")
        pipeline.extract_frames()
        console.print(f"[green]✓[/green] Frames saved to {output / 'frames'}")
        return
    # Full pipeline: frames -> transcript -> component analysis -> docs.
    console.print("[yellow]Step 1/4:[/yellow] Extracting frames...")
    frames = pipeline.extract_frames()
    console.print(f" [green]✓[/green] Extracted {len(frames)} frames")
    console.print("[yellow]Step 2/4:[/yellow] Transcribing audio...")
    transcript = pipeline.transcribe_audio()
    console.print(f" [green]✓[/green] Transcribed {len(transcript.segments)} segments")
    console.print("[yellow]Step 3/4:[/yellow] Analyzing components...")
    analysis = pipeline.analyze_components(frames, transcript)
    console.print(f" [green]✓[/green] Identified {len(analysis.components)} components")
    console.print("[yellow]Step 4/4:[/yellow] Generating documentation...")
    doc_path = pipeline.generate_documentation(analysis, atomizer_hints=atomizer_hints, bom=bom)
    console.print(f" [green]✓[/green] Documentation saved to {doc_path}")
    if pdf:
        console.print("[yellow]Generating PDF...[/yellow]")
        pdf_path = pipeline.generate_pdf(doc_path)
        console.print(f" [green]✓[/green] PDF saved to {pdf_path}")
    console.print(f"\n[bold green]Done![/bold green] Output: {output}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,218 @@
"""Documentation generator - produces markdown and PDF output."""
from pathlib import Path
from datetime import datetime
from jinja2 import Environment, FileSystemLoader, BaseLoader
from .vision_analyzer import ComponentAnalysis, Component
# Default markdown template, embedded so the tool works with no template
# files installed; a template_dir passed to DocGenerator overrides it.
# Context keys consumed: analysis, timestamp, bom, atomizer_hints,
# component_names, materials (see DocGenerator.generate).
DEFAULT_TEMPLATE = '''# {{ analysis.assembly_name }} - Technical Documentation
**Generated:** {{ timestamp }}
**Source:** Video walkthrough documentation
---
## Executive Summary
{{ analysis.summary }}
---
## Components
{% for component in analysis.components %}
### {{ loop.index }}. {{ component.name }}
{% if component.description %}
{{ component.description }}
{% endif %}
{% if component.function %}
- **Function:** {{ component.function }}
{% endif %}
{% if component.material %}
- **Material:** {{ component.material }}
{% endif %}
{% if component.part_number %}
- **Part Number:** {{ component.part_number }}
{% endif %}
{% if component.features %}
**Key Features:**
{% for feature in component.features %}
- {{ feature }}
{% endfor %}
{% endif %}
{% if component.best_frame %}
![{{ component.name }}](frames/{{ component.best_frame.path.name }})
{% endif %}
{% if component.transcript_excerpt %}
> *From walkthrough:* "{{ component.transcript_excerpt }}"
{% endif %}
---
{% endfor %}
{% if bom %}
## Bill of Materials
| Item | P/N | Name | Qty | Material | Notes |
|------|-----|------|-----|----------|-------|
{% for component in analysis.components %}
| {{ loop.index }} | {{ component.part_number or 'TBD' }} | {{ component.name }} | 1 | {{ component.material or 'TBD' }} | {{ component.function }} |
{% endfor %}
{% endif %}
{% if analysis.assembly_notes %}
## Assembly Notes
{{ analysis.assembly_notes }}
{% endif %}
{% if atomizer_hints %}
## Atomizer FEA Hints
Based on the video walkthrough, the following optimization parameters are suggested:
```json
{
"model_understanding": {
"components": {{ component_names | tojson }},
"materials_mentioned": {{ materials | tojson }}
},
"suggested_study": {
"objectives": [
{"name": "mass", "direction": "minimize"}
],
"constraints_likely": []
}
}
```
{% endif %}
---
## Raw Transcript
<details>
<summary>Click to expand full transcript</summary>
{{ analysis.raw_transcript }}
</details>
---
*Documentation generated by CAD-Documenter*
'''
class DocGenerator:
    """Generates documentation from analysis results.

    Renders a markdown report (from the embedded DEFAULT_TEMPLATE or a
    ``*.md.j2`` file in ``template_dir``) and can convert it to PDF or emit
    an Atomizer hints JSON file.
    """

    def __init__(self, output_dir: Path, template_dir: Path | None = None):
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Prefer on-disk templates when a directory is provided; otherwise
        # templates are rendered from strings (DEFAULT_TEMPLATE).
        if template_dir and template_dir.exists():
            self.env = Environment(loader=FileSystemLoader(template_dir))
        else:
            self.env = Environment(loader=BaseLoader())

    def generate(
        self,
        analysis: ComponentAnalysis,
        atomizer_hints: bool = False,
        bom: bool = False,
        template_name: str | None = None,
    ) -> Path:
        """Generate markdown documentation.

        Args:
            analysis: Component analysis results to document.
            atomizer_hints: Include the Atomizer FEA hints section.
            bom: Include a Bill of Materials table.
            template_name: Name of a ``<name>.md.j2`` template in the
                configured template directory; defaults to the embedded one.

        Returns:
            Path to the written ``documentation.md``.
        """
        if template_name:
            template = self.env.get_template(f"{template_name}.md.j2")
        else:
            template = self.env.from_string(DEFAULT_TEMPLATE)
        context = {
            "analysis": analysis,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "atomizer_hints": atomizer_hints,
            "bom": bom,
            "component_names": [c.name for c in analysis.components],
            # sorted() keeps the rendered material list deterministic across
            # runs (set iteration order is not stable between processes).
            "materials": sorted({c.material for c in analysis.components if c.material}),
        }
        content = template.render(**context)
        output_path = self.output_dir / "documentation.md"
        # Explicit encoding so output is stable regardless of platform default.
        output_path.write_text(content, encoding="utf-8")
        return output_path

    def generate_pdf(self, markdown_path: Path) -> Path:
        """
        Generate PDF from markdown using Atomaste Report Standard.
        Requires the atomaste-reports skill/Typst to be available; falls back
        to pandoc when the Atomaste build script is absent.

        Raises:
            RuntimeError: if the converter is missing or exits non-zero
                (the tool's stderr is included in the message).
        """
        import subprocess
        pdf_path = markdown_path.with_suffix(".pdf")
        # NOTE(review): hard-coded user path — should come from config/env.
        build_script = Path("/home/papa/Atomaste/Templates/Atomaste_Report_Standard/scripts/build-report.py")
        if build_script.exists():
            cmd = ["python3", str(build_script), str(markdown_path), "-o", str(pdf_path)]
        else:
            # Fallback to pandoc
            cmd = ["pandoc", str(markdown_path), "-o", str(pdf_path)]
        try:
            subprocess.run(cmd, capture_output=True, check=True, text=True)
        except FileNotFoundError as e:
            # Converter binary not installed at all.
            raise RuntimeError(f"PDF generation failed: {cmd[0]} not found") from e
        except subprocess.CalledProcessError as e:
            # Surface the converter's stderr so failures are diagnosable
            # (the original discarded it).
            raise RuntimeError(f"PDF generation failed: {e}\n{e.stderr}") from e
        return pdf_path

    def generate_atomizer_hints(self, analysis: ComponentAnalysis) -> Path:
        """Generate standalone Atomizer hints JSON file (``atomizer_hints.json``)."""
        import json
        hints = {
            "model_understanding": {
                "assembly_name": analysis.assembly_name,
                "components": [c.name for c in analysis.components],
                # sorted for deterministic JSON output across runs
                "materials_mentioned": sorted({c.material for c in analysis.components if c.material}),
                "functions": {c.name: c.function for c in analysis.components if c.function},
            },
            "suggested_spec": {
                "objectives": [
                    {"name": "mass", "direction": "minimize"}
                ],
                "parameters_likely": [],
                "constraints_likely": [],
            },
            # TODO: populate with notable per-component transcript quotes.
            "transcript_highlights": [],
        }
        output_path = self.output_dir / "atomizer_hints.json"
        output_path.write_text(json.dumps(hints, indent=2), encoding="utf-8")
        return output_path

View File

@@ -0,0 +1,64 @@
"""Main documentation pipeline orchestrator."""
from pathlib import Path
from dataclasses import dataclass, field
from .video_processor import VideoProcessor, FrameInfo
from .audio_analyzer import AudioAnalyzer, Transcript
from .vision_analyzer import VisionAnalyzer, ComponentAnalysis
from .doc_generator import DocGenerator
@dataclass
class PipelineConfig:
    """Pipeline configuration."""

    # NOTE(review): appears unused by DocumentationPipeline, which carries
    # the same settings as direct fields — wire this in or keep them in sync.
    frame_interval: float = 2.0   # seconds between extracted frames
    whisper_model: str = "base"   # Whisper model size
    vision_model: str = "gpt-4o"  # or local model
@dataclass
class DocumentationPipeline:
    """Orchestrates the full documentation pipeline.

    Thin facade over the four stage objects (video, audio, vision, docs);
    each public method delegates to the matching stage so the CLI only
    talks to this class.
    """

    video_path: Path              # input walkthrough video
    output_dir: Path              # root directory for frames + documentation
    frame_interval: float = 2.0   # seconds between extracted frames
    whisper_model: str = "base"   # Whisper model size

    def __post_init__(self):
        # Wire up the stage objects; frames land in a subdirectory of output_dir.
        self.video_processor = VideoProcessor(self.video_path, self.output_dir / "frames")
        self.audio_analyzer = AudioAnalyzer(self.video_path, self.whisper_model)
        self.vision_analyzer = VisionAnalyzer()
        self.doc_generator = DocGenerator(self.output_dir)

    def extract_frames(self) -> list[FrameInfo]:
        """Extract key frames from video."""
        return self.video_processor.extract_frames(interval=self.frame_interval)

    def transcribe_audio(self) -> Transcript:
        """Transcribe audio track."""
        return self.audio_analyzer.transcribe()

    def analyze_components(
        self, frames: list[FrameInfo], transcript: Transcript
    ) -> ComponentAnalysis:
        """Analyze frames + transcript to identify components."""
        return self.vision_analyzer.analyze(frames, transcript)

    def generate_documentation(
        self,
        analysis: ComponentAnalysis,
        atomizer_hints: bool = False,
        bom: bool = False,
    ) -> Path:
        """Generate markdown documentation."""
        return self.doc_generator.generate(
            analysis,
            atomizer_hints=atomizer_hints,
            bom=bom,
        )

    def generate_pdf(self, markdown_path: Path) -> Path:
        """Generate PDF from markdown using Atomaste Report Standard."""
        return self.doc_generator.generate_pdf(markdown_path)

View File

@@ -0,0 +1,112 @@
"""Video processing module - frame extraction and scene detection."""
import subprocess
import json
from pathlib import Path
from dataclasses import dataclass
@dataclass
class FrameInfo:
    """Information about an extracted frame."""

    path: Path         # location of the extracted JPEG on disk
    timestamp: float   # seconds from the start of the video
    frame_number: int  # 0-based index in extraction order
class VideoProcessor:
    """Handles video frame extraction using ffmpeg."""

    def __init__(self, video_path: Path, output_dir: Path):
        self.video_path = video_path  # source video file
        self.output_dir = output_dir  # where extracted frame JPEGs are written
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _run(cmd: list[str]) -> subprocess.CompletedProcess:
        """Run an external tool, raising RuntimeError with its stderr on failure.

        The original code ignored exit statuses, so a missing/corrupt video
        surfaced much later as confusing downstream errors.
        """
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"{cmd[0]} failed ({result.returncode}): {result.stderr}")
        return result

    def get_duration(self) -> float:
        """Get video duration in seconds via ffprobe's JSON output.

        Raises:
            RuntimeError: if ffprobe fails (previously this showed up as an
                opaque JSONDecodeError on empty stdout).
        """
        cmd = [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_format",
            str(self.video_path)
        ]
        result = self._run(cmd)
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])

    def extract_frames(self, interval: float = 2.0) -> list[FrameInfo]:
        """
        Extract frames at regular intervals.

        Args:
            interval: Seconds between frame extractions

        Returns:
            List of FrameInfo objects for extracted frames

        Raises:
            RuntimeError: if ffmpeg fails.
        """
        # Use ffmpeg to extract frames at interval.  (Removed an unused
        # get_duration() call the original made here.)
        output_pattern = self.output_dir / "frame_%04d.jpg"
        self._run([
            "ffmpeg", "-y",
            "-i", str(self.video_path),
            "-vf", f"fps=1/{interval}",
            "-q:v", "2",  # High quality JPEG
            str(output_pattern)
        ])
        # Collect extracted frames.  NOTE(review): the glob also picks up
        # frame_*.jpg files left over from a previous run — consider
        # clearing the directory first.
        frames = []
        for i, frame_path in enumerate(sorted(self.output_dir.glob("frame_*.jpg"))):
            # Timestamps are approximated as i * interval, matching the
            # fps=1/interval sampling rate.
            frames.append(FrameInfo(
                path=frame_path,
                timestamp=i * interval,
                frame_number=i
            ))
        return frames

    def extract_audio(self, output_path: Path | None = None) -> Path:
        """Extract the audio track as 16 kHz mono WAV (the format Whisper wants).

        Raises:
            RuntimeError: if ffmpeg fails.
        """
        if output_path is None:
            output_path = self.output_dir.parent / "audio.wav"
        self._run([
            "ffmpeg", "-y",
            "-i", str(self.video_path),
            "-vn",  # No video
            "-acodec", "pcm_s16le",
            "-ar", "16000",  # 16kHz for Whisper
            "-ac", "1",  # Mono
            str(output_path)
        ])
        return output_path

    @staticmethod
    def _parse_scene_timestamps(ffmpeg_stderr: str) -> list[float]:
        """Pull pts_time values out of ffmpeg showinfo log lines.

        Skips malformed lines (the original raised IndexError on a line
        ending in 'pts_time:' and ValueError on a non-numeric value).
        """
        timestamps = []
        for line in ffmpeg_stderr.split("\n"):
            if "pts_time:" in line:
                parts = line.split("pts_time:")
                if len(parts) > 1:
                    tokens = parts[1].split()
                    if not tokens:
                        continue  # 'pts_time:' at end of line — nothing to parse
                    try:
                        timestamps.append(float(tokens[0]))
                    except ValueError:
                        continue  # malformed value — skip, don't abort the scan
        return timestamps

    def detect_scene_changes(self, threshold: float = 0.3) -> list[float]:
        """
        Detect scene changes in video.
        Returns list of timestamps where significant visual changes occur,
        parsed from the showinfo filter's log on stderr.
        """
        cmd = [
            "ffmpeg", "-i", str(self.video_path),
            "-vf", f"select='gt(scene,{threshold})',showinfo",
            "-f", "null", "-"
        ]
        # Deliberately tolerant of ffmpeg's exit status here: showinfo logs
        # to stderr and partial output is still usable.
        result = subprocess.run(cmd, capture_output=True, text=True)
        return self._parse_scene_timestamps(result.stderr)

View File

@@ -0,0 +1,111 @@
"""Vision analysis module - component detection and feature extraction."""
from pathlib import Path
from dataclasses import dataclass, field
from .video_processor import FrameInfo
from .audio_analyzer import Transcript
@dataclass
class Component:
    """A detected component from the CAD model."""

    name: str                 # human-readable component name
    description: str          # free-text description from the analysis
    function: str = ""        # what the part does in the assembly
    material: str = ""        # material, when mentioned in the walkthrough
    features: list[str] = field(default_factory=list)  # notable geometric/functional features
    best_frame: FrameInfo | None = None  # representative frame, if one was chosen
    transcript_excerpt: str = ""  # supporting quote from the narration
    part_number: str = ""  # For Part Manager integration
@dataclass
class ComponentAnalysis:
    """Complete analysis results."""

    assembly_name: str         # name extracted from the walkthrough narration
    summary: str               # executive summary for the report
    components: list[Component]  # all components identified in the video
    assembly_notes: str = ""   # assembly/installation remarks from the transcript
    raw_transcript: str = ""   # full transcript text, for the report appendix
class VisionAnalyzer:
    """Analyzes frames to identify components and features."""

    def __init__(self, model: str = "gpt-4o"):
        self.model = model  # vision model identifier used for analysis

    def analyze(
        self, frames: list[FrameInfo], transcript: Transcript
    ) -> ComponentAnalysis:
        """
        Analyze frames and transcript to identify components.
        This is where the AI magic happens - correlating visual and verbal info.
        """
        # MVP multi-modal approach: send key frames to a vision model with
        # transcript context and correlate with the verbal descriptions.
        # Placeholder implementation - will be enhanced with actual AI calls.
        found = self._identify_components(frames, transcript)
        return ComponentAnalysis(
            assembly_name=self._extract_assembly_name(transcript),
            summary=self._generate_summary(found, transcript),
            components=found,
            assembly_notes=self._extract_assembly_notes(transcript),
            raw_transcript=transcript.full_text,
        )

    def _identify_components(
        self, frames: list[FrameInfo], transcript: Transcript
    ) -> list[Component]:
        """Identify individual components from frames + transcript."""
        # TODO: Implement vision API calls (Phase 1); empty until then.
        return []

    def _generate_summary(
        self, components: list[Component], transcript: Transcript
    ) -> str:
        """Generate executive summary of the assembly."""
        # TODO: Implement with LLM
        return f"Assembly documentation generated from video walkthrough. {len(components)} components identified."

    def _extract_assembly_name(self, transcript: Transcript) -> str:
        """Try to extract assembly name from transcript."""
        lowered = transcript.full_text.lower()
        # First matching introductory pattern wins, in listed order.
        for marker in ("this is the", "presenting the", "looking at the", "reviewing the"):
            if marker not in lowered:
                continue
            start = lowered.find(marker) + len(marker)
            # Up to 50 chars after the marker, truncated at the first period.
            candidate = transcript.full_text[start:start + 50].strip().split(".")[0]
            return candidate.strip()
        return "Untitled Assembly"

    def _extract_assembly_notes(self, transcript: Transcript) -> str:
        """Extract assembly-related notes from transcript."""
        # Segments mentioning assembly verbs are treated as instructions.
        verbs = ("assemble", "install", "mount", "attach", "connect")
        picked = [
            seg.text
            for seg in transcript.segments
            if any(verb in seg.text.lower() for verb in verbs)
        ]
        return " ".join(picked) if picked else ""

    def analyze_single_frame(self, frame: FrameInfo, context: str = "") -> dict:
        """
        Analyze a single frame for components and features.
        Returns dict with detected components, features, and confidence.
        """
        # TODO: Implement with vision API
        return {
            "components": [],
            "features": [],
            "confidence": 0.0
        }