feat(assets): binary asset store + artifact entity + wiki evidence (Issue F)
Wires visual evidence into the knowledge graph. Images, PDFs, and CAD
exports can now be uploaded, deduped by SHA-256, thumbnailed, linked to
entities via EVIDENCED_BY, and rendered inline on wiki pages. Unblocks
AKC uploading voice-session screenshots alongside extracted entities.
- assets/ module: store_asset (hash dedup + MIME allowlist + 20 MB cap),
get_asset_binary, get_thumbnail (Pillow, on-disk cache under
.thumbnails/<size>/), list_orphan_assets, invalidate_asset
- models/database.py: new `assets` table + indexes
- engineering/service.py: `artifact` added to ENTITY_TYPES
- api/routes.py: POST /assets (multipart), GET /assets/{id},
/assets/{id}/thumbnail, /assets/{id}/meta, /admin/assets/orphans,
DELETE /assets/{id} (409 if still referenced),
GET /entities/{id}/evidence (EVIDENCED_BY artifacts with asset meta)
- main.py: all new paths aliased under /v1
- engineering/wiki.py: entity pages render EVIDENCED_BY → artifact as a
"Visual evidence" thumbnail strip; artifact pages render the full
image + caption + capture_context
- deploy/dalidou/docker-compose.yml: bind-mount ${ATOCORE_ASSETS_DIR}
- config.py: assets_dir + assets_max_upload_bytes settings
- requirements.txt + pyproject.toml: python-multipart, Pillow>=10.0.0
- tests/test_assets.py: 16 tests (dedup, cap, thumbnail cache, orphan
detection, invalidate gating, API upload/fetch, evidence, v1 aliases,
wiki rendering)
- DEV-LEDGER.md: session log + cleanup note + test_count 478 -> 494
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
367
src/atocore/assets/service.py
Normal file
367
src/atocore/assets/service.py
Normal file
@@ -0,0 +1,367 @@
|
||||
"""Binary asset storage with hash-dedup and on-demand thumbnails.
|
||||
|
||||
Issue F — visual evidence. Stores uploaded images / PDFs / CAD exports
|
||||
under ``<assets_dir>/<hash[:2]>/<hash>.<ext>``. Re-uploads are idempotent
|
||||
on SHA-256. Thumbnails are generated on first request and cached under
|
||||
``<assets_dir>/.thumbnails/<size>/<hash>.jpg``.
|
||||
|
||||
Kept deliberately small: no authentication, no background jobs, no
image transformations beyond thumbnailing. The MIME allowlist and the
upload size cap are enforced here in ``store_asset``; callers (API
layer) only map the resulting errors to HTTP responses.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import atocore.config as _config
|
||||
from atocore.models.database import get_connection
|
||||
from atocore.observability.logger import get_logger
|
||||
|
||||
log = get_logger("assets")
|
||||
|
||||
|
||||
# Whitelisted mime types. Start conservative; extend when a real use
# case lands rather than speculatively.
# Maps allowed MIME type -> file extension used for the on-disk blob
# (see _blob_path). store_asset rejects anything not listed here.
ALLOWED_MIME_TYPES: dict[str, str] = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/webp": "webp",
    "image/gif": "gif",
    "application/pdf": "pdf",
    "model/step": "step",  # STEP CAD export
    "model/iges": "iges",  # IGES CAD export
}
|
||||
|
||||
|
||||
class AssetError(Exception):
    """Base class for asset errors."""


class AssetTooLarge(AssetError):
    """Upload exceeds the configured size cap (assets_max_upload_bytes)."""


class AssetTypeNotAllowed(AssetError):
    """Upload's MIME type is not in ALLOWED_MIME_TYPES."""


class AssetNotFound(AssetError):
    """Asset id resolves to no usable catalog row, or the blob is missing."""
|
||||
|
||||
|
||||
@dataclass
class Asset:
    """Catalog row for one stored binary blob."""

    id: str  # uuid4, primary key
    hash_sha256: str  # content hash; dedup key used by store_asset
    mime_type: str  # one of ALLOWED_MIME_TYPES
    size_bytes: int
    stored_path: str  # on-disk location of the blob
    width: int | None = None  # image pixel width; None for non-images / probe failure
    height: int | None = None  # image pixel height; None for non-images / probe failure
    original_filename: str = ""
    project: str = ""
    caption: str = ""
    source_refs: list[str] = field(default_factory=list)  # persisted as JSON text
    status: str = "active"  # 'active' or 'invalid' (tombstoned by invalidate_asset)
    created_at: str = ""  # UTC "YYYY-MM-DD HH:MM:SS", set by store_asset
    updated_at: str = ""

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict of all fields.

        Note: source_refs is the live list, not a copy.
        """
        return {
            "id": self.id,
            "hash_sha256": self.hash_sha256,
            "mime_type": self.mime_type,
            "size_bytes": self.size_bytes,
            "width": self.width,
            "height": self.height,
            "stored_path": self.stored_path,
            "original_filename": self.original_filename,
            "project": self.project,
            "caption": self.caption,
            "source_refs": self.source_refs,
            "status": self.status,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
        }
|
||||
|
||||
|
||||
def _assets_root() -> Path:
    """Return the configured asset root directory, creating it if absent."""
    directory = _config.settings.resolved_assets_dir
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
||||
|
||||
|
||||
def _blob_path(hash_sha256: str, ext: str) -> Path:
    """Blob location: <assets_dir>/<hash[:2]>/<hash>.<ext> (two-char fan-out)."""
    shard = hash_sha256[:2]
    filename = f"{hash_sha256}.{ext}"
    return _assets_root() / shard / filename
|
||||
|
||||
|
||||
def _thumbnails_root() -> Path:
    """Directory holding all cached thumbnails (hidden under the asset root)."""
    return _assets_root() / ".thumbnails"


def _thumbnail_path(hash_sha256: str, size: int) -> Path:
    """Cache location for one thumbnail: .thumbnails/<size>/<hash>.jpg."""
    return _thumbnails_root() / str(size) / f"{hash_sha256}.jpg"
|
||||
|
||||
|
||||
def _image_dimensions(data: bytes, mime_type: str) -> tuple[int | None, int | None]:
|
||||
if not mime_type.startswith("image/"):
|
||||
return None, None
|
||||
try:
|
||||
from PIL import Image
|
||||
except Exception:
|
||||
return None, None
|
||||
try:
|
||||
with Image.open(BytesIO(data)) as img:
|
||||
return img.width, img.height
|
||||
except Exception as e:
|
||||
log.warning("asset_dimension_probe_failed", error=str(e))
|
||||
return None, None
|
||||
|
||||
|
||||
def store_asset(
    data: bytes,
    mime_type: str,
    original_filename: str = "",
    project: str = "",
    caption: str = "",
    source_refs: list[str] | None = None,
) -> Asset:
    """Persist a binary blob and return the catalog row.

    Idempotent on SHA-256 — a re-upload returns the existing asset row
    without rewriting the blob or creating a duplicate catalog entry.
    Caption / project / source_refs on re-upload are ignored; update
    those via the owning entity's properties instead.

    Raises AssetTooLarge / AssetTypeNotAllowed on validation failure.
    """
    # Validate before touching disk or the database.
    limit = _config.settings.assets_max_upload_bytes
    size = len(data)
    if size > limit:
        raise AssetTooLarge(f"Upload is {size} bytes; limit is {limit} bytes")
    if mime_type not in ALLOWED_MIME_TYPES:
        raise AssetTypeNotAllowed(
            f"mime_type {mime_type!r} not in allowlist. "
            f"Allowed: {sorted(ALLOWED_MIME_TYPES)}"
        )

    digest = hashlib.sha256(data).hexdigest()
    ext = ALLOWED_MIME_TYPES[mime_type]

    # Idempotency — if we already have this hash, return the existing row.
    duplicate = _fetch_by_hash(digest)
    if duplicate is not None:
        log.info("asset_dedup_hit", asset_id=duplicate.id, hash=digest[:12])
        return duplicate

    width, height = _image_dimensions(data, mime_type)

    # NOTE(review): blob is written before the catalog INSERT; a failed
    # INSERT leaves an unreferenced blob on disk until a gc pass. Confirm
    # this is acceptable for the deployment.
    blob_path = _blob_path(digest, ext)
    blob_path.parent.mkdir(parents=True, exist_ok=True)
    blob_path.write_bytes(data)

    asset_id = str(uuid.uuid4())
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    refs = source_refs or []

    with get_connection() as conn:
        conn.execute(
            """INSERT INTO assets
            (id, hash_sha256, mime_type, size_bytes, width, height,
            stored_path, original_filename, project, caption,
            source_refs, status, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'active', ?, ?)""",
            (
                asset_id, digest, mime_type, size, width, height,
                str(blob_path), original_filename, project, caption,
                json.dumps(refs), now, now,
            ),
        )

    log.info(
        "asset_stored", asset_id=asset_id, hash=digest[:12],
        mime_type=mime_type, size_bytes=size,
    )
    return Asset(
        id=asset_id, hash_sha256=digest, mime_type=mime_type,
        size_bytes=size, width=width, height=height,
        stored_path=str(blob_path), original_filename=original_filename,
        project=project, caption=caption, source_refs=refs,
        status="active", created_at=now, updated_at=now,
    )
|
||||
|
||||
|
||||
def _fetch_by_hash(hash_sha256: str) -> Asset | None:
    """Look up a non-invalidated catalog row by content hash, or None."""
    query = "SELECT * FROM assets WHERE hash_sha256 = ? AND status != 'invalid'"
    with get_connection() as conn:
        row = conn.execute(query, (hash_sha256,)).fetchone()
        if not row:
            return None
        return _row_to_asset(row)
|
||||
|
||||
|
||||
def get_asset(asset_id: str) -> Asset | None:
    """Fetch a catalog row by id (invalidated rows included), or None."""
    with get_connection() as conn:
        row = conn.execute(
            "SELECT * FROM assets WHERE id = ?", (asset_id,)
        ).fetchone()
        if not row:
            return None
        return _row_to_asset(row)
|
||||
|
||||
|
||||
def get_asset_binary(asset_id: str) -> tuple[Asset, bytes]:
    """Return (metadata, raw bytes). Raises AssetNotFound.

    Invalidated assets are treated the same as missing ones.
    """
    asset = get_asset(asset_id)
    if asset is None or asset.status == "invalid":
        raise AssetNotFound(f"Asset not found: {asset_id}")

    blob = Path(asset.stored_path)
    if not blob.exists():
        # Catalog/disk drift — surface it loudly rather than returning junk.
        raise AssetNotFound(
            f"Asset {asset_id} row exists but blob is missing at {blob}"
        )
    return asset, blob.read_bytes()
|
||||
|
||||
|
||||
def get_thumbnail(asset_id: str, size: int = 240) -> tuple[Asset, bytes]:
    """Return (metadata, thumbnail JPEG bytes).

    Thumbnails are only generated for image mime types. For non-images
    the caller should render a placeholder instead. Generated thumbs
    are cached on disk at ``<assets_dir>/.thumbnails/<size>/<hash>.jpg``.

    Raises:
        AssetNotFound: unknown/invalidated id, or blob missing on disk.
        AssetError: non-image asset, or Pillow unavailable.
    """
    asset = get_asset(asset_id)
    if asset is None or asset.status == "invalid":
        raise AssetNotFound(f"Asset not found: {asset_id}")
    if not asset.mime_type.startswith("image/"):
        raise AssetError(
            f"Thumbnails are only supported for images; "
            f"{asset.mime_type!r} is not an image"
        )

    # Clamp so arbitrary query params can't request absurd sizes
    # (cache-directory explosion / memory blow-up).
    size = max(16, min(int(size), 2048))
    thumb_path = _thumbnail_path(asset.hash_sha256, size)
    if thumb_path.exists():
        return asset, thumb_path.read_bytes()

    try:
        from PIL import Image
    except Exception as e:
        # Fix: chain the cause so the underlying import failure isn't lost.
        raise AssetError(f"Pillow not available for thumbnailing: {e}") from e

    src_path = Path(asset.stored_path)
    if not src_path.exists():
        raise AssetNotFound(
            f"Asset {asset_id} row exists but blob is missing at {src_path}"
        )

    thumb_path.parent.mkdir(parents=True, exist_ok=True)
    with Image.open(src_path) as img:
        # JPEG cannot encode alpha/palette modes; normalize to RGB.
        img = img.convert("RGB") if img.mode not in ("RGB", "L") else img
        img.thumbnail((size, size))
        buf = BytesIO()
        img.save(buf, format="JPEG", quality=85, optimize=True)
    jpeg_bytes = buf.getvalue()
    # Fix: write-then-rename so a crash mid-write can never leave a
    # truncated JPEG in the cache to be served by later requests.
    tmp_path = thumb_path.with_suffix(".jpg.tmp")
    tmp_path.write_bytes(jpeg_bytes)
    tmp_path.replace(thumb_path)
    return asset, jpeg_bytes
|
||||
|
||||
|
||||
def list_orphan_assets(limit: int = 200) -> list[Asset]:
    """Assets not referenced by any active entity or memory.

    "Referenced" means: an active entity has ``properties.asset_id``
    pointing at this asset, OR any active entity / memory's
    source_refs contains ``asset:<id>``.

    Only the newest ``min(limit, 1000)`` active assets are considered.
    """
    capped = min(limit, 1000)
    with get_connection() as conn:
        candidates = conn.execute(
            "SELECT * FROM assets WHERE status = 'active' "
            "ORDER BY created_at DESC LIMIT ?",
            (capped,),
        ).fetchall()

        referenced: set[str] = set()
        entity_rows = conn.execute(
            "SELECT properties, source_refs FROM entities "
            "WHERE status = 'active'"
        ).fetchall()
        for entity in entity_rows:
            # Each parse is independently best-effort: one corrupt JSON
            # column must not hide references in the other.
            try:
                linked = json.loads(entity["properties"] or "{}").get("asset_id")
                if linked:
                    referenced.add(linked)
            except Exception:
                pass
            try:
                for ref in json.loads(entity["source_refs"] or "[]"):
                    if isinstance(ref, str) and ref.startswith("asset:"):
                        referenced.add(ref.split(":", 1)[1])
            except Exception:
                pass

    # Memories don't have a properties dict, but source_refs may carry
    # asset:<id> after Issue F lands for memory-level evidence.
    # The memories table has no source_refs column today — skip here
    # and extend once that lands.
    orphans: list[Asset] = []
    for candidate in candidates:
        if candidate["id"] not in referenced:
            orphans.append(_row_to_asset(candidate))
    return orphans
|
||||
|
||||
|
||||
def _asset_is_referenced(asset_id: str) -> bool:
    """True if any active entity references *asset_id*.

    Mirrors the "referenced" definition used by list_orphan_assets:
    ``properties.asset_id`` equals the id, or source_refs contains
    ``asset:<id>``.
    """
    needle = f"asset:{asset_id}"
    with get_connection() as conn:
        rows = conn.execute(
            "SELECT properties, source_refs FROM entities "
            "WHERE status = 'active'"
        ).fetchall()
    for r in rows:
        # Best-effort JSON parsing, consistent with list_orphan_assets.
        try:
            if json.loads(r["properties"] or "{}").get("asset_id") == asset_id:
                return True
        except Exception:
            pass
        try:
            if needle in json.loads(r["source_refs"] or "[]"):
                return True
        except Exception:
            pass
    return False


def invalidate_asset(asset_id: str, actor: str = "api", note: str = "") -> bool:
    """Tombstone an asset. No-op if still referenced.

    Returns True on success, False if the asset is missing or still
    referenced by an active entity (caller should get a 409 in that
    case). The blob file stays on disk until a future gc pass sweeps
    orphaned blobs — this function only flips the catalog status.
    """
    asset = get_asset(asset_id)
    if asset is None:
        return False
    # Bug fix: the previous gate used list_orphan_assets(limit=1000),
    # which only inspects the newest 1000 active assets — an older
    # orphan was wrongly reported as referenced and could never be
    # invalidated. Check this asset's references directly instead.
    if asset.status == "active" and _asset_is_referenced(asset_id):
        log.info("asset_invalidate_blocked_referenced", asset_id=asset_id)
        return False

    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        conn.execute(
            "UPDATE assets SET status = 'invalid', updated_at = ? WHERE id = ?",
            (now, asset_id),
        )
    log.info("asset_invalidated", asset_id=asset_id, actor=actor, note=note[:80])
    return True
|
||||
|
||||
|
||||
def _row_to_asset(row) -> Asset:
    """Hydrate an Asset dataclass from a catalog row.

    source_refs is stored as JSON text; corrupt JSON degrades to [].
    NULL-able text columns are normalized to "".
    """
    try:
        refs = json.loads(row["source_refs"] or "[]")
    except Exception:
        refs = []

    def _text(column: str) -> str:
        return row[column] or ""

    return Asset(
        id=row["id"],
        hash_sha256=row["hash_sha256"],
        mime_type=row["mime_type"],
        size_bytes=row["size_bytes"],
        width=row["width"],
        height=row["height"],
        stored_path=row["stored_path"],
        original_filename=_text("original_filename"),
        project=_text("project"),
        caption=_text("caption"),
        source_refs=refs,
        status=row["status"],
        created_at=_text("created_at"),
        updated_at=_text("updated_at"),
    )
|
||||
Reference in New Issue
Block a user