Initial project scaffold - Phase 1 MVP structure

Core modules:
- cli.py: Command-line interface with Click
- pipeline.py: Main orchestrator
- video_processor.py: Frame extraction with ffmpeg
- audio_analyzer.py: Whisper transcription
- vision_analyzer.py: Component detection (placeholder)
- doc_generator.py: Markdown + PDF output

Also includes:
- pyproject.toml with uv/hatch config
- Prompts for AI analysis
- Basic tests
- ROADMAP.md with 4-week plan
This commit is contained in:
Mario Lavoie
2026-01-27 20:05:34 +00:00
parent 621234cbdf
commit 1e94a98e5b
16 changed files with 1062 additions and 1 deletion

View File

@@ -0,0 +1,3 @@
"""CAD-Documenter: Video walkthrough → Complete engineering documentation."""
__version__ = "0.1.0"

View File

@@ -0,0 +1,104 @@
"""Audio analysis module - transcription via Whisper."""
from pathlib import Path
from dataclasses import dataclass
import subprocess
import tempfile
@dataclass
class TranscriptSegment:
    """A segment of transcribed audio."""

    start: float  # segment start, seconds from the beginning of the audio
    end: float    # segment end, seconds
    text: str     # transcribed text for this segment
@dataclass
class Transcript:
    """Full transcript with segments."""

    segments: list[TranscriptSegment]
    full_text: str

    def get_text_at(self, timestamp: float, window: float = 5.0) -> str:
        """Return the text of every segment overlapping [timestamp - window,
        timestamp + window], joined with single spaces (may be empty)."""
        lo, hi = timestamp - window, timestamp + window
        return " ".join(
            seg.text for seg in self.segments if seg.start <= hi and seg.end >= lo
        )
class AudioAnalyzer:
    """Handles audio transcription using Whisper."""

    def __init__(self, video_path: Path, model: str = "base"):
        self.video_path = video_path  # source video whose audio track is transcribed
        self.model = model            # Whisper model size (tiny/base/small/medium/large)

    def transcribe(self) -> Transcript:
        """
        Transcribe audio from video using Whisper.

        Extracts a 16 kHz mono WAV to a temp file with ffmpeg, runs Whisper
        on it, and always removes the temp file afterwards.

        Returns:
            Transcript object with segments and full text

        Raises:
            RuntimeError: if ffmpeg fails to extract the audio track.
        """
        # Reserve a temp file name; the handle is closed immediately so
        # ffmpeg can write to it (needed on platforms with exclusive locks).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_path = Path(f.name)
        try:
            # 16 kHz mono PCM is the input format Whisper expects.
            cmd = [
                "ffmpeg", "-y",
                "-i", str(self.video_path),
                "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1",
                str(audio_path),
            ]
            proc = subprocess.run(cmd, capture_output=True)
            # Fail loudly on a bad/missing video instead of handing Whisper
            # an empty WAV (the original ignored the exit status).
            if proc.returncode != 0:
                raise RuntimeError(
                    "ffmpeg audio extraction failed: "
                    + proc.stderr.decode(errors="replace")
                )
            import whisper  # local import: heavy optional dependency

            model = whisper.load_model(self.model)
            result = model.transcribe(str(audio_path), word_timestamps=True)
            segments = [
                TranscriptSegment(
                    start=seg["start"],
                    end=seg["end"],
                    text=seg["text"].strip(),
                )
                for seg in result.get("segments", [])
            ]
            return Transcript(
                segments=segments,
                full_text=result.get("text", "").strip(),
            )
        finally:
            # Always clean up the temp WAV, even when ffmpeg/Whisper fail
            # (the original leaked it if ffmpeg raised before the try block).
            audio_path.unlink(missing_ok=True)

    def extract_keywords(self, transcript: Transcript) -> list[str]:
        """Extract likely component names and technical terms.

        Heuristic: for each introductory phrase ("this is the", ...), take
        the first few words after its first occurrence.  Can be enhanced
        with real NLP later.  Returned order is unspecified (set-based dedupe).
        """
        keywords = []
        indicator_phrases = [
            "this is the", "this is a", "here we have",
            "the main", "called the", "known as",
            "this part", "this component", "this assembly"
        ]
        text_lower = transcript.full_text.lower()
        for phrase in indicator_phrases:
            if phrase in text_lower:
                # Only the first occurrence of each phrase is inspected;
                # grab a 50-char window after it and keep up to 3 words.
                idx = text_lower.find(phrase)
                after = transcript.full_text[idx + len(phrase):idx + len(phrase) + 50]
                words = after.strip().split()[:3]
                if words:
                    keywords.append(" ".join(words).strip(",.;:"))
        return list(set(keywords))

86
src/cad_documenter/cli.py Normal file
View File

@@ -0,0 +1,86 @@
"""CAD-Documenter CLI - Main entry point."""
import click
from pathlib import Path
from rich.console import Console
from .pipeline import DocumentationPipeline
# Single shared Rich console for all CLI output.
console = Console()
@click.command()
@click.argument("video", type=click.Path(exists=True, path_type=Path))
@click.option("-o", "--output", type=click.Path(path_type=Path), help="Output directory")
@click.option("--frames-only", is_flag=True, help="Only extract frames, skip documentation")
@click.option("--atomizer-hints", is_flag=True, help="Generate Atomizer FEA hints")
@click.option("--bom", is_flag=True, help="Generate Bill of Materials")
@click.option("--pdf", is_flag=True, help="Generate PDF via Atomaste Report Standard")
@click.option("--frame-interval", default=2.0, help="Seconds between frame extractions")
@click.option("--whisper-model", default="base", help="Whisper model size (tiny/base/small/medium/large)")
@click.version_option()
def main(
    video: Path,
    output: Path | None,
    frames_only: bool,
    atomizer_hints: bool,
    bom: bool,
    pdf: bool,
    frame_interval: float,
    whisper_model: str,
):
    """
    Generate engineering documentation from a CAD walkthrough video.
    VIDEO: Path to the video file (.mp4, .mov, .avi, etc.)
    """
    # NOTE(review): banner version is hard-coded — consider printing the
    # package __version__ so this stays in sync with releases.
    # (Fixed: this line had an f-string prefix with no placeholders.)
    console.print("[bold blue]CAD-Documenter[/bold blue] v0.1.0")
    console.print(f"Processing: [cyan]{video}[/cyan]")
    # Default output directory sits next to the input video.
    if output is None:
        output = video.parent / f"{video.stem}_docs"
    output.mkdir(parents=True, exist_ok=True)
    # Build the pipeline once; individual stages are invoked below.
    pipeline = DocumentationPipeline(
        video_path=video,
        output_dir=output,
        frame_interval=frame_interval,
        whisper_model=whisper_model,
    )
    if frames_only:
        # Early exit: frame extraction only, no transcription/analysis/docs.
        console.print("[yellow]Extracting frames only...[/yellow]")
        pipeline.extract_frames()
        console.print(f"[green]✓[/green] Frames saved to {output / 'frames'}")
        return
    # Full pipeline: frames -> transcript -> component analysis -> docs.
    console.print("[yellow]Step 1/4:[/yellow] Extracting frames...")
    frames = pipeline.extract_frames()
    console.print(f" [green]✓[/green] Extracted {len(frames)} frames")
    console.print("[yellow]Step 2/4:[/yellow] Transcribing audio...")
    transcript = pipeline.transcribe_audio()
    console.print(f" [green]✓[/green] Transcribed {len(transcript.segments)} segments")
    console.print("[yellow]Step 3/4:[/yellow] Analyzing components...")
    analysis = pipeline.analyze_components(frames, transcript)
    console.print(f" [green]✓[/green] Identified {len(analysis.components)} components")
    console.print("[yellow]Step 4/4:[/yellow] Generating documentation...")
    doc_path = pipeline.generate_documentation(analysis, atomizer_hints=atomizer_hints, bom=bom)
    console.print(f" [green]✓[/green] Documentation saved to {doc_path}")
    if pdf:
        console.print("[yellow]Generating PDF...[/yellow]")
        pdf_path = pipeline.generate_pdf(doc_path)
        console.print(f" [green]✓[/green] PDF saved to {pdf_path}")
    console.print(f"\n[bold green]Done![/bold green] Output: {output}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,218 @@
"""Documentation generator - produces markdown and PDF output."""
from pathlib import Path
from datetime import datetime
from jinja2 import Environment, FileSystemLoader, BaseLoader
from .vision_analyzer import ComponentAnalysis, Component
# Default markdown template, embedded so the tool works with no template
# files installed; a template_dir passed to DocGenerator overrides it.
# Context keys consumed: analysis, timestamp, bom, atomizer_hints,
# component_names, materials (see DocGenerator.generate).
DEFAULT_TEMPLATE = '''# {{ analysis.assembly_name }} - Technical Documentation
**Generated:** {{ timestamp }}
**Source:** Video walkthrough documentation
---
## Executive Summary
{{ analysis.summary }}
---
## Components
{% for component in analysis.components %}
### {{ loop.index }}. {{ component.name }}
{% if component.description %}
{{ component.description }}
{% endif %}
{% if component.function %}
- **Function:** {{ component.function }}
{% endif %}
{% if component.material %}
- **Material:** {{ component.material }}
{% endif %}
{% if component.part_number %}
- **Part Number:** {{ component.part_number }}
{% endif %}
{% if component.features %}
**Key Features:**
{% for feature in component.features %}
- {{ feature }}
{% endfor %}
{% endif %}
{% if component.best_frame %}
![{{ component.name }}](frames/{{ component.best_frame.path.name }})
{% endif %}
{% if component.transcript_excerpt %}
> *From walkthrough:* "{{ component.transcript_excerpt }}"
{% endif %}
---
{% endfor %}
{% if bom %}
## Bill of Materials
| Item | P/N | Name | Qty | Material | Notes |
|------|-----|------|-----|----------|-------|
{% for component in analysis.components %}
| {{ loop.index }} | {{ component.part_number or 'TBD' }} | {{ component.name }} | 1 | {{ component.material or 'TBD' }} | {{ component.function }} |
{% endfor %}
{% endif %}
{% if analysis.assembly_notes %}
## Assembly Notes
{{ analysis.assembly_notes }}
{% endif %}
{% if atomizer_hints %}
## Atomizer FEA Hints
Based on the video walkthrough, the following optimization parameters are suggested:
```json
{
"model_understanding": {
"components": {{ component_names | tojson }},
"materials_mentioned": {{ materials | tojson }}
},
"suggested_study": {
"objectives": [
{"name": "mass", "direction": "minimize"}
],
"constraints_likely": []
}
}
```
{% endif %}
---
## Raw Transcript
<details>
<summary>Click to expand full transcript</summary>
{{ analysis.raw_transcript }}
</details>
---
*Documentation generated by CAD-Documenter*
'''
class DocGenerator:
    """Generates documentation from analysis results.

    Renders a markdown report (from the embedded DEFAULT_TEMPLATE or a
    ``*.md.j2`` file in ``template_dir``) and can convert it to PDF or emit
    an Atomizer hints JSON file.
    """

    def __init__(self, output_dir: Path, template_dir: Path | None = None):
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Prefer on-disk templates when a directory is provided; otherwise
        # templates are rendered from strings (DEFAULT_TEMPLATE).
        if template_dir and template_dir.exists():
            self.env = Environment(loader=FileSystemLoader(template_dir))
        else:
            self.env = Environment(loader=BaseLoader())

    def generate(
        self,
        analysis: ComponentAnalysis,
        atomizer_hints: bool = False,
        bom: bool = False,
        template_name: str | None = None,
    ) -> Path:
        """Generate markdown documentation.

        Args:
            analysis: Component analysis results to document.
            atomizer_hints: Include the Atomizer FEA hints section.
            bom: Include a Bill of Materials table.
            template_name: Name of a ``<name>.md.j2`` template in the
                configured template directory; defaults to the embedded one.

        Returns:
            Path to the written ``documentation.md``.
        """
        if template_name:
            template = self.env.get_template(f"{template_name}.md.j2")
        else:
            template = self.env.from_string(DEFAULT_TEMPLATE)
        context = {
            "analysis": analysis,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "atomizer_hints": atomizer_hints,
            "bom": bom,
            "component_names": [c.name for c in analysis.components],
            # sorted() keeps the rendered material list deterministic across
            # runs (set iteration order is not stable between processes).
            "materials": sorted({c.material for c in analysis.components if c.material}),
        }
        content = template.render(**context)
        output_path = self.output_dir / "documentation.md"
        # Explicit encoding so output is stable regardless of platform default.
        output_path.write_text(content, encoding="utf-8")
        return output_path

    def generate_pdf(self, markdown_path: Path) -> Path:
        """
        Generate PDF from markdown using Atomaste Report Standard.
        Requires the atomaste-reports skill/Typst to be available; falls back
        to pandoc when the Atomaste build script is absent.

        Raises:
            RuntimeError: if the converter is missing or exits non-zero
                (the tool's stderr is included in the message).
        """
        import subprocess
        pdf_path = markdown_path.with_suffix(".pdf")
        # NOTE(review): hard-coded user path — should come from config/env.
        build_script = Path("/home/papa/Atomaste/Templates/Atomaste_Report_Standard/scripts/build-report.py")
        if build_script.exists():
            cmd = ["python3", str(build_script), str(markdown_path), "-o", str(pdf_path)]
        else:
            # Fallback to pandoc
            cmd = ["pandoc", str(markdown_path), "-o", str(pdf_path)]
        try:
            subprocess.run(cmd, capture_output=True, check=True, text=True)
        except FileNotFoundError as e:
            # Converter binary not installed at all.
            raise RuntimeError(f"PDF generation failed: {cmd[0]} not found") from e
        except subprocess.CalledProcessError as e:
            # Surface the converter's stderr so failures are diagnosable
            # (the original discarded it).
            raise RuntimeError(f"PDF generation failed: {e}\n{e.stderr}") from e
        return pdf_path

    def generate_atomizer_hints(self, analysis: ComponentAnalysis) -> Path:
        """Generate standalone Atomizer hints JSON file (``atomizer_hints.json``)."""
        import json
        hints = {
            "model_understanding": {
                "assembly_name": analysis.assembly_name,
                "components": [c.name for c in analysis.components],
                # sorted for deterministic JSON output across runs
                "materials_mentioned": sorted({c.material for c in analysis.components if c.material}),
                "functions": {c.name: c.function for c in analysis.components if c.function},
            },
            "suggested_spec": {
                "objectives": [
                    {"name": "mass", "direction": "minimize"}
                ],
                "parameters_likely": [],
                "constraints_likely": [],
            },
            # TODO: populate with notable per-component transcript quotes.
            "transcript_highlights": [],
        }
        output_path = self.output_dir / "atomizer_hints.json"
        output_path.write_text(json.dumps(hints, indent=2), encoding="utf-8")
        return output_path

View File

@@ -0,0 +1,64 @@
"""Main documentation pipeline orchestrator."""
from pathlib import Path
from dataclasses import dataclass, field
from .video_processor import VideoProcessor, FrameInfo
from .audio_analyzer import AudioAnalyzer, Transcript
from .vision_analyzer import VisionAnalyzer, ComponentAnalysis
from .doc_generator import DocGenerator
@dataclass
class PipelineConfig:
    """Pipeline configuration."""

    # NOTE(review): appears unused by DocumentationPipeline, which carries
    # the same settings as direct fields — wire this in or keep them in sync.
    frame_interval: float = 2.0   # seconds between extracted frames
    whisper_model: str = "base"   # Whisper model size
    vision_model: str = "gpt-4o"  # or local model
@dataclass
class DocumentationPipeline:
    """Orchestrates the full documentation pipeline.

    Thin facade over the four stage objects (video, audio, vision, docs);
    each public method delegates to the matching stage so the CLI only
    talks to this class.
    """

    video_path: Path              # input walkthrough video
    output_dir: Path              # root directory for frames + documentation
    frame_interval: float = 2.0   # seconds between extracted frames
    whisper_model: str = "base"   # Whisper model size

    def __post_init__(self):
        # Wire up the stage objects; frames land in a subdirectory of output_dir.
        self.video_processor = VideoProcessor(self.video_path, self.output_dir / "frames")
        self.audio_analyzer = AudioAnalyzer(self.video_path, self.whisper_model)
        self.vision_analyzer = VisionAnalyzer()
        self.doc_generator = DocGenerator(self.output_dir)

    def extract_frames(self) -> list[FrameInfo]:
        """Extract key frames from video."""
        return self.video_processor.extract_frames(interval=self.frame_interval)

    def transcribe_audio(self) -> Transcript:
        """Transcribe audio track."""
        return self.audio_analyzer.transcribe()

    def analyze_components(
        self, frames: list[FrameInfo], transcript: Transcript
    ) -> ComponentAnalysis:
        """Analyze frames + transcript to identify components."""
        return self.vision_analyzer.analyze(frames, transcript)

    def generate_documentation(
        self,
        analysis: ComponentAnalysis,
        atomizer_hints: bool = False,
        bom: bool = False,
    ) -> Path:
        """Generate markdown documentation."""
        return self.doc_generator.generate(
            analysis,
            atomizer_hints=atomizer_hints,
            bom=bom,
        )

    def generate_pdf(self, markdown_path: Path) -> Path:
        """Generate PDF from markdown using Atomaste Report Standard."""
        return self.doc_generator.generate_pdf(markdown_path)

View File

@@ -0,0 +1,112 @@
"""Video processing module - frame extraction and scene detection."""
import subprocess
import json
from pathlib import Path
from dataclasses import dataclass
@dataclass
class FrameInfo:
    """Information about an extracted frame."""

    path: Path         # location of the extracted JPEG on disk
    timestamp: float   # seconds from the start of the video
    frame_number: int  # 0-based index in extraction order
class VideoProcessor:
    """Handles video frame extraction using ffmpeg."""

    def __init__(self, video_path: Path, output_dir: Path):
        self.video_path = video_path  # source video file
        self.output_dir = output_dir  # where extracted frame JPEGs are written
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _run(cmd: list[str]) -> subprocess.CompletedProcess:
        """Run an external tool, raising RuntimeError with its stderr on failure.

        The original code ignored exit statuses, so a missing/corrupt video
        surfaced much later as confusing downstream errors.
        """
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"{cmd[0]} failed ({result.returncode}): {result.stderr}")
        return result

    def get_duration(self) -> float:
        """Get video duration in seconds via ffprobe's JSON output.

        Raises:
            RuntimeError: if ffprobe fails (previously this showed up as an
                opaque JSONDecodeError on empty stdout).
        """
        cmd = [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_format",
            str(self.video_path)
        ]
        result = self._run(cmd)
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])

    def extract_frames(self, interval: float = 2.0) -> list[FrameInfo]:
        """
        Extract frames at regular intervals.

        Args:
            interval: Seconds between frame extractions

        Returns:
            List of FrameInfo objects for extracted frames

        Raises:
            RuntimeError: if ffmpeg fails.
        """
        # Use ffmpeg to extract frames at interval.  (Removed an unused
        # get_duration() call the original made here.)
        output_pattern = self.output_dir / "frame_%04d.jpg"
        self._run([
            "ffmpeg", "-y",
            "-i", str(self.video_path),
            "-vf", f"fps=1/{interval}",
            "-q:v", "2",  # High quality JPEG
            str(output_pattern)
        ])
        # Collect extracted frames.  NOTE(review): the glob also picks up
        # frame_*.jpg files left over from a previous run — consider
        # clearing the directory first.
        frames = []
        for i, frame_path in enumerate(sorted(self.output_dir.glob("frame_*.jpg"))):
            # Timestamps are approximated as i * interval, matching the
            # fps=1/interval sampling rate.
            frames.append(FrameInfo(
                path=frame_path,
                timestamp=i * interval,
                frame_number=i
            ))
        return frames

    def extract_audio(self, output_path: Path | None = None) -> Path:
        """Extract the audio track as 16 kHz mono WAV (the format Whisper wants).

        Raises:
            RuntimeError: if ffmpeg fails.
        """
        if output_path is None:
            output_path = self.output_dir.parent / "audio.wav"
        self._run([
            "ffmpeg", "-y",
            "-i", str(self.video_path),
            "-vn",  # No video
            "-acodec", "pcm_s16le",
            "-ar", "16000",  # 16kHz for Whisper
            "-ac", "1",  # Mono
            str(output_path)
        ])
        return output_path

    @staticmethod
    def _parse_scene_timestamps(ffmpeg_stderr: str) -> list[float]:
        """Pull pts_time values out of ffmpeg showinfo log lines.

        Skips malformed lines (the original raised IndexError on a line
        ending in 'pts_time:' and ValueError on a non-numeric value).
        """
        timestamps = []
        for line in ffmpeg_stderr.split("\n"):
            if "pts_time:" in line:
                parts = line.split("pts_time:")
                if len(parts) > 1:
                    tokens = parts[1].split()
                    if not tokens:
                        continue  # 'pts_time:' at end of line — nothing to parse
                    try:
                        timestamps.append(float(tokens[0]))
                    except ValueError:
                        continue  # malformed value — skip, don't abort the scan
        return timestamps

    def detect_scene_changes(self, threshold: float = 0.3) -> list[float]:
        """
        Detect scene changes in video.
        Returns list of timestamps where significant visual changes occur,
        parsed from the showinfo filter's log on stderr.
        """
        cmd = [
            "ffmpeg", "-i", str(self.video_path),
            "-vf", f"select='gt(scene,{threshold})',showinfo",
            "-f", "null", "-"
        ]
        # Deliberately tolerant of ffmpeg's exit status here: showinfo logs
        # to stderr and partial output is still usable.
        result = subprocess.run(cmd, capture_output=True, text=True)
        return self._parse_scene_timestamps(result.stderr)

View File

@@ -0,0 +1,111 @@
"""Vision analysis module - component detection and feature extraction."""
from pathlib import Path
from dataclasses import dataclass, field
from .video_processor import FrameInfo
from .audio_analyzer import Transcript
@dataclass
class Component:
    """A detected component from the CAD model."""

    name: str                 # human-readable component name
    description: str          # free-text description from the analysis
    function: str = ""        # what the part does in the assembly
    material: str = ""        # material, when mentioned in the walkthrough
    features: list[str] = field(default_factory=list)  # notable geometric/functional features
    best_frame: FrameInfo | None = None  # representative frame, if one was chosen
    transcript_excerpt: str = ""  # supporting quote from the narration
    part_number: str = ""  # For Part Manager integration
@dataclass
class ComponentAnalysis:
    """Complete analysis results."""

    assembly_name: str         # name extracted from the walkthrough narration
    summary: str               # executive summary for the report
    components: list[Component]  # all components identified in the video
    assembly_notes: str = ""   # assembly/installation remarks from the transcript
    raw_transcript: str = ""   # full transcript text, for the report appendix
class VisionAnalyzer:
    """Analyzes frames to identify components and features."""

    def __init__(self, model: str = "gpt-4o"):
        self.model = model  # vision model identifier used for analysis

    def analyze(
        self, frames: list[FrameInfo], transcript: Transcript
    ) -> ComponentAnalysis:
        """
        Analyze frames and transcript to identify components.
        This is where the AI magic happens - correlating visual and verbal info.
        """
        # MVP multi-modal approach: send key frames to a vision model with
        # transcript context and correlate with the verbal descriptions.
        # Placeholder implementation - will be enhanced with actual AI calls.
        found = self._identify_components(frames, transcript)
        return ComponentAnalysis(
            assembly_name=self._extract_assembly_name(transcript),
            summary=self._generate_summary(found, transcript),
            components=found,
            assembly_notes=self._extract_assembly_notes(transcript),
            raw_transcript=transcript.full_text,
        )

    def _identify_components(
        self, frames: list[FrameInfo], transcript: Transcript
    ) -> list[Component]:
        """Identify individual components from frames + transcript."""
        # TODO: Implement vision API calls (Phase 1); empty until then.
        return []

    def _generate_summary(
        self, components: list[Component], transcript: Transcript
    ) -> str:
        """Generate executive summary of the assembly."""
        # TODO: Implement with LLM
        return f"Assembly documentation generated from video walkthrough. {len(components)} components identified."

    def _extract_assembly_name(self, transcript: Transcript) -> str:
        """Try to extract assembly name from transcript."""
        lowered = transcript.full_text.lower()
        # First matching introductory pattern wins, in listed order.
        for marker in ("this is the", "presenting the", "looking at the", "reviewing the"):
            if marker not in lowered:
                continue
            start = lowered.find(marker) + len(marker)
            # Up to 50 chars after the marker, truncated at the first period.
            candidate = transcript.full_text[start:start + 50].strip().split(".")[0]
            return candidate.strip()
        return "Untitled Assembly"

    def _extract_assembly_notes(self, transcript: Transcript) -> str:
        """Extract assembly-related notes from transcript."""
        # Segments mentioning assembly verbs are treated as instructions.
        verbs = ("assemble", "install", "mount", "attach", "connect")
        picked = [
            seg.text
            for seg in transcript.segments
            if any(verb in seg.text.lower() for verb in verbs)
        ]
        return " ".join(picked) if picked else ""

    def analyze_single_frame(self, frame: FrameInfo, context: str = "") -> dict:
        """
        Analyze a single frame for components and features.
        Returns dict with detected components, features, and confidence.
        """
        # TODO: Implement with vision API
        return {
            "components": [],
            "features": [],
            "confidence": 0.0
        }