Clarify Windows vs Clawdbot responsibilities

Windows KB Capture now:
- Records clips
- Merges video
- Transcribes with Whisper (GPU)
- Finds screenshot triggers with context
- Exports: merged.mp4, transcript.json, metadata.json

Clawdbot (via knowledge-base skill) now:
- Extracts frames at trigger timestamps
- Analyzes frames with vision
- Updates KB files
- Organizes images

Removed frame extraction from Windows - that's now Clawdbot's (Mario's) job.
Added screenshot_triggers to metadata.json with context for Mario.
This commit is contained in:
Mario Lavoie
2026-02-09 22:05:33 +00:00
parent e647255c60
commit 09c32cbad2

View File

@@ -64,18 +64,14 @@ class SessionExporter:
self.on_progress("Merging clips...", 0.1)
merged_path = self._merge_clips(session_dir, kept_clips, export_dir)
# Step 2: Transcribe
# Step 2: Transcribe (uses local GPU Whisper)
self.on_progress("Transcribing audio...", 0.3)
transcript = self._transcribe(merged_path, export_dir)
# Step 3: Extract frames at screenshot triggers
self.on_progress("Extracting frames...", 0.7)
frames_dir = export_dir / "frames"
self._extract_frames(merged_path, transcript, frames_dir)
# Step 4: Create metadata
self.on_progress("Creating metadata...", 0.9)
self._create_metadata(session, export_dir, merged_path)
# Step 3: Create metadata
# Note: Frame extraction is done by Clawdbot using the knowledge-base skill
self.on_progress("Creating metadata...", 0.8)
self._create_metadata(session, export_dir, merged_path, transcript)
self.on_progress("Export complete!", 1.0)
@@ -230,13 +226,43 @@ class SessionExporter:
secs = int(seconds % 60)
return f"{mins:02d}-{secs:02d}"
def _find_screenshot_triggers(self, transcript: dict) -> list[dict]:
    """Locate every transcript segment that mentions "screenshot".

    The match is a case-insensitive substring test on each segment's
    ``text``. For every hit, the surrounding speech (up to two segments
    before and two after, plus the segment itself) is joined with single
    spaces so the consumer of metadata.json can tell what the screenshot
    refers to.

    Args:
        transcript: Transcript mapping; its ``segments`` entry is read as
            a list of dicts with optional ``text`` and ``start`` keys.
            A missing ``segments`` entry is treated as an empty list.

    Returns:
        One dict per matching segment, in transcript order, with the keys
        ``timestamp``, ``timestamp_formatted``, ``segment_text`` and
        ``context``.
    """
    all_segments = transcript.get("segments", [])
    found = []

    for index, current in enumerate(all_segments):
        spoken = current.get("text", "")
        if "screenshot" not in spoken.lower():
            continue

        # A segment without a start time is reported at 0.
        start = current.get("start", 0)

        # Clamp the lower bound so a hit near the beginning does not
        # produce a negative index and wrap around to the end of the list.
        window_start = max(0, index - 2)
        window = all_segments[window_start:index + 3]
        surrounding = " ".join(
            neighbor.get("text", "") for neighbor in window
        )

        found.append(
            {
                "timestamp": start,
                "timestamp_formatted": self._format_timestamp(start),
                "segment_text": spoken,
                "context": surrounding.strip(),
            }
        )

    return found
def _create_metadata(
self,
session: Session,
export_dir: Path,
merged_path: Path,
transcript: dict,
) -> None:
"""Create metadata.json for Clawdbot."""
# Find screenshot triggers for Mario
triggers = self._find_screenshot_triggers(transcript)
# Get video duration
duration = self._get_video_duration(merged_path)
metadata = {
"session_id": session.id,
"name": session.name,
@@ -245,7 +271,8 @@ class SessionExporter:
"created_at": session.created_at.isoformat(),
"exported_at": datetime.now().isoformat(),
"clip_count": session.clip_count,
"total_duration": session.total_duration,
"total_duration": duration,
"status": "pending", # → "processed" after Clawdbot processes
"clips": [
{
"id": clip.id,
@@ -254,10 +281,10 @@ class SessionExporter:
}
for clip in session.kept_clips
],
"screenshot_triggers": triggers, # Pre-parsed for Mario
"files": {
"video": "merged.mp4",
"transcript": "transcript.json",
"frames": "frames/",
},
}