"""Markdown file parsing with frontmatter extraction.""" import re from dataclasses import dataclass, field from pathlib import Path import frontmatter @dataclass class ParsedDocument: file_path: str title: str body: str tags: list[str] = field(default_factory=list) frontmatter: dict = field(default_factory=dict) headings: list[tuple[int, str]] = field(default_factory=list) def parse_markdown(file_path: Path, text: str | None = None) -> ParsedDocument: """Parse a markdown file, extracting frontmatter and structure.""" raw_text = text if text is not None else file_path.read_text(encoding="utf-8") post = frontmatter.loads(raw_text) meta = dict(post.metadata) if post.metadata else {} body = post.content.strip() # Extract title: first H1, or filename title = _extract_title(body, file_path) # Extract tags from frontmatter tags = meta.get("tags", []) if isinstance(tags, str): tags = [t.strip() for t in tags.split(",") if t.strip()] tags = tags or [] # Extract heading structure headings = _extract_headings(body) return ParsedDocument( file_path=str(file_path.resolve()), title=title, body=body, tags=tags, frontmatter=meta, headings=headings, ) def _extract_title(body: str, file_path: Path) -> str: """Get title from first H1 or fallback to filename.""" match = re.search(r"^#\s+(.+)$", body, re.MULTILINE) if match: return match.group(1).strip() return file_path.stem.replace("_", " ").replace("-", " ").title() def _extract_headings(body: str) -> list[tuple[int, str]]: """Extract all headings with their level.""" headings = [] for match in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE): level = len(match.group(1)) text = match.group(2).strip() headings.append((level, text)) return headings