# fieldwitness/src/soosef/verisoo/models.py

"""
Core data models for Verisoo.

Designed for the photographer provenance use case:
"I took this photo in Ukraine, not San Francisco - here's my cryptographic proof."
"""

from __future__ import annotations

import hashlib
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any


@dataclass(frozen=True)
class Identity:
    """
    An attestor identity backed by Ed25519 keypair.

    The fingerprint is the first 16 bytes of SHA-256(public_key), hex-encoded.
    This provides a short, recognizable identifier without exposing the full key.
    The class stores the fingerprint it is given; it does not derive or verify it.

    Attributes:
        public_key: Raw public key bytes.
        fingerprint: Hex fingerprint string identifying the key.
        created_at: Creation time; defaults to the current UTC time.
        metadata: Free-form extra data; the optional "name" entry is used
            by ``display_name``.
    """

    public_key: bytes
    fingerprint: str
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def display_name(self) -> str:
        """Human-readable name if set, otherwise short fingerprint.

        A "name" entry that is missing, ``None`` or empty counts as not set,
        so this always yields a usable, non-empty label.
        """
        name = self.metadata.get("name")
        if name:
            return name
        # Fall back to the first 8 fingerprint characters plus an ellipsis.
        return f"{self.fingerprint[:8]}..."

    def __str__(self) -> str:
        """Return ``Identity(<display name>)``."""
        return f"Identity({self.display_name})"


@dataclass(frozen=True)
class GeoLocation:
    """
    Geographic location where image was captured.

    Holds a latitude/longitude pair plus optional accuracy, altitude and a
    human-readable place name. Values are stored as given; no range checks
    are performed here.
    """

    latitude: float  # -90 to 90
    longitude: float  # -180 to 180
    accuracy_meters: float | None = None  # GPS accuracy
    altitude_meters: float | None = None
    location_name: str | None = None  # "Kyiv, Ukraine" - human readable

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a compact dict with keys lat/lon (+ accuracy, altitude, name).

        Accuracy and altitude are emitted whenever they are not ``None``
        (so ``0.0`` is kept); the name is emitted only when non-empty.
        """
        payload: dict[str, Any] = {"lat": self.latitude, "lon": self.longitude}
        numeric_extras = (
            ("accuracy", self.accuracy_meters),
            ("altitude", self.altitude_meters),
        )
        for key, value in numeric_extras:
            if value is not None:
                payload[key] = value
        if self.location_name:
            payload["name"] = self.location_name
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> GeoLocation:
        """Build a location from the dict shape produced by ``to_dict``.

        Raises:
            KeyError: if "lat" or "lon" is missing.
        """
        lat = d["lat"]
        lon = d["lon"]
        return cls(lat, lon, d.get("accuracy"), d.get("altitude"), d.get("name"))

    def __str__(self) -> str:
        """Render coordinates to 4 decimals, prefixed by the place name if any."""
        coords = f"({self.latitude:.4f}, {self.longitude:.4f})"
        if not self.location_name:
            return coords
        return f"{self.location_name} {coords}"


@dataclass(frozen=True)
class CaptureDevice:
    """
    Information about the device that captured the image.

    Every field is optional; unset or empty fields are left out of the
    serialized form.
    """

    make: str | None = None  # "Apple"
    model: str | None = None  # "iPhone 15 Pro"
    software: str | None = None  # "iOS 17.4"
    serial_hash: str | None = None  # Hash of device serial (privacy-preserving)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict containing only the fields that are set (truthy)."""
        candidates = (
            ("make", self.make),
            ("model", self.model),
            ("software", self.software),
            ("serial_hash", self.serial_hash),
        )
        return {key: value for key, value in candidates if value}

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> CaptureDevice:
        """Build a device from the dict shape produced by ``to_dict``; missing keys become ``None``."""
        make = d.get("make")
        model = d.get("model")
        software = d.get("software")
        serial_hash = d.get("serial_hash")
        return cls(make, model, software, serial_hash)

    def __str__(self) -> str:
        """Return "<make> <model>" using whichever parts are set, else "Unknown device"."""
        label = " ".join(filter(None, (self.make, self.model)))
        return label or "Unknown device"


@dataclass(frozen=True)
class CaptureMetadata:
    """
    Rich metadata about image capture for provenance.

    This is what lets a photographer say "I took this in Kyiv, not San Francisco."

    All fields are optional. ``to_dict`` leaves out fields that are unset
    (falsy) and ``from_dict`` accepts dicts with any subset of the keys.
    """

    # When was it actually captured (from device clock/EXIF, not attestation time)
    captured_at: datetime | None = None

    # Where was it captured
    location: GeoLocation | None = None

    # What device captured it
    device: CaptureDevice | None = None

    # Photographer's notes at capture time
    caption: str | None = None

    # Image technical details
    width: int | None = None
    height: int | None = None
    mime_type: str | None = None

    # Original filename
    filename: str | None = None

    # Free-form tags
    tags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict, omitting unset fields.

        ``captured_at`` is written as an ISO-8601 string; ``location`` and
        ``device`` are written via their own ``to_dict``. The returned
        ``tags`` list is a copy, so editing the result never changes this
        (frozen) instance.
        """
        d: dict[str, Any] = {}
        if self.captured_at:
            d["captured_at"] = self.captured_at.isoformat()
        if self.location:
            d["location"] = self.location.to_dict()
        if self.device:
            d["device"] = self.device.to_dict()
        if self.caption:
            d["caption"] = self.caption
        if self.width:
            d["width"] = self.width
        if self.height:
            d["height"] = self.height
        if self.mime_type:
            d["mime_type"] = self.mime_type
        if self.filename:
            d["filename"] = self.filename
        if self.tags:
            # Copy: handing out the internal list would let callers mutate
            # a frozen instance through the serialized dict.
            d["tags"] = list(self.tags)
        return d

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> CaptureMetadata:
        """Build metadata from the dict shape produced by ``to_dict``.

        Missing keys become ``None`` (or an empty list for ``tags``). The
        tags are copied so the new instance does not share a list with
        ``d``; a ``None``/empty value yields ``[]`` and a bare string is
        treated as a single tag rather than split into characters.

        Raises:
            ValueError: if ``captured_at`` is not a valid ISO-8601 string.
            KeyError: if a nested ``location`` lacks "lat" or "lon".
            TypeError: if ``tags`` is present but not iterable.
        """
        raw_tags = d.get("tags")
        if not raw_tags:
            tags: list[str] = []
        elif isinstance(raw_tags, str):
            tags = [raw_tags]
        else:
            tags = list(raw_tags)

        return cls(
            captured_at=datetime.fromisoformat(d["captured_at"]) if d.get("captured_at") else None,
            location=GeoLocation.from_dict(d["location"]) if d.get("location") else None,
            device=CaptureDevice.from_dict(d["device"]) if d.get("device") else None,
            caption=d.get("caption"),
            width=d.get("width"),
            height=d.get("height"),
            mime_type=d.get("mime_type"),
            filename=d.get("filename"),
            tags=tags,
        )


@dataclass(frozen=True)
class ImageHashes:
    """
    Multi-algorithm image fingerprinting for robust matching.

    Intended to keep matching after the kinds of changes images pick up
    when shared:
    - JPEG recompression
    - Resizing
    - Format conversion
    - Cropping
    - Color adjustments

    Two images are treated as a perceptual match as soon as ANY of the
    available hashes is within the threshold.
    """

    sha256: str  # Exact match only - rarely survives sharing
    phash: str  # DCT-based perceptual hash - survives compression
    dhash: str  # Difference hash - survives resizing
    ahash: str | None = None  # Average hash - very tolerant
    colorhash: str | None = None  # Color distribution - survives crops
    crop_resistant: str | None = None  # Center-region hash

    def matches_exactly(self, other: ImageHashes) -> bool:
        """Return True when both SHA-256 digests are equal (byte-identical images)."""
        same_digest = self.sha256 == other.sha256
        return same_digest

    def matches_perceptually(
        self,
        other: ImageHashes,
        threshold: int = 10,
    ) -> tuple[bool, str | None]:
        """
        Check if images match perceptually.

        Algorithms are tried in the order phash, dhash, ahash, colorhash,
        crop_resistant; a pair is compared only when both sides have a
        non-empty value.

        Returns ``(True, name)`` for the FIRST algorithm whose hamming
        distance is at most ``threshold``, otherwise ``(False, None)``.
        """
        # The two mandatory perceptual hashes always take part.
        pairs = [
            ("phash", self.phash, other.phash),
            ("dhash", self.dhash, other.dhash),
        ]

        # Optional hashes join in only when both images carry them.
        for algo in ("ahash", "colorhash", "crop_resistant"):
            mine = getattr(self, algo)
            if mine and getattr(other, algo):
                pairs.append((algo, mine, getattr(other, algo)))

        for algo, mine, theirs in pairs:
            if not (mine and theirs):
                continue
            if _hamming_distance(mine, theirs) <= threshold:
                return True, algo

        return False, None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict; optional hashes appear only when set (truthy)."""
        payload = {
            "sha256": self.sha256,
            "phash": self.phash,
            "dhash": self.dhash,
        }
        for algo in ("ahash", "colorhash", "crop_resistant"):
            value = getattr(self, algo)
            if value:
                payload[algo] = value
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> ImageHashes:
        """Build hashes from the dict shape produced by ``to_dict``.

        Raises:
            KeyError: if "sha256", "phash" or "dhash" is missing.
        """
        required = {key: d[key] for key in ("sha256", "phash", "dhash")}
        optional = {key: d.get(key) for key in ("ahash", "colorhash", "crop_resistant")}
        return cls(**required, **optional)


@dataclass(frozen=True)
class AttestationRecord:
    """
    The core attestation record stored in the append-only log.

    This is the cryptographic proof that a specific image existed
    at a specific time, attested by a specific identity, with specific metadata.

    Once in the log, it cannot be modified or deleted.

    Attributes:
        image_hashes: Hashes of the attested image.
        signature: Signature bytes supplied by the caller; this class
            stores them and does not verify them.
        attestor_fingerprint: Fingerprint of the attesting identity.
        timestamp: When the attestation was created.
        metadata: Capture metadata in dict form (CaptureMetadata.to_dict()).
    """

    image_hashes: ImageHashes
    signature: bytes
    attestor_fingerprint: str
    timestamp: datetime  # When attestation was created
    metadata: dict[str, Any] = field(default_factory=dict)  # CaptureMetadata.to_dict()

    @property
    def record_id(self) -> str:
        """
        Unique identifier for this record.

        First 32 hex characters of SHA-256 over
        "<sha256>|<attestor_fingerprint>|<timestamp ISO-8601>".
        Deterministic for a given record and made of hex digits only.
        """
        content = f"{self.image_hashes.sha256}|{self.attestor_fingerprint}|{self.timestamp.isoformat()}"
        return hashlib.sha256(content.encode()).hexdigest()[:32]

    @property
    def short_id(self) -> str:
        """Short ID for display/URLs (first 12 chars)."""
        return self.record_id[:12]

    @property
    def capture_metadata(self) -> CaptureMetadata | None:
        """Parse metadata as CaptureMetadata if present.

        Best-effort: returns ``None`` when the metadata is empty or cannot
        be parsed, instead of raising.
        """
        if not self.metadata:
            return None
        try:
            return CaptureMetadata.from_dict(self.metadata)
        except (KeyError, TypeError, ValueError, AttributeError):
            # KeyError/TypeError: missing or wrongly-typed entries.
            # ValueError: datetime.fromisoformat rejects a malformed
            # "captured_at". AttributeError: a nested entry that is not a
            # dict (e.g. "device" given as a string). All mean the same
            # thing to callers: no usable capture metadata.
            return None

    @property
    def location(self) -> GeoLocation | None:
        """Shortcut to capture location."""
        cm = self.capture_metadata
        return cm.location if cm else None

    @property
    def captured_at(self) -> datetime | None:
        """Shortcut to capture time (may differ from attestation time)."""
        cm = self.capture_metadata
        return cm.captured_at if cm else None

    def to_bytes(self) -> bytes:
        """Serialize for signing/hashing.

        Produces compact JSON with sorted keys so the same record always
        yields the same bytes. The signature itself is not included.
        """
        data = {
            "hashes": self.image_hashes.to_dict(),
            "attestor": self.attestor_fingerprint,
            "timestamp": self.timestamp.isoformat(),
            "metadata": self.metadata,
        }
        return json.dumps(data, sort_keys=True, separators=(",", ":")).encode()

    @classmethod
    def from_bytes(cls, data: bytes, signature: bytes) -> AttestationRecord:
        """Deserialize from wire format.

        Args:
            data: JSON bytes in the shape produced by ``to_bytes``.
            signature: Signature to attach to the record (not checked here).

        Raises:
            KeyError: if "hashes", "attestor" or "timestamp" is missing.
            ValueError: if the JSON or the timestamp string is malformed.
        """
        obj = json.loads(data.decode())
        return cls(
            image_hashes=ImageHashes.from_dict(obj["hashes"]),
            signature=signature,
            attestor_fingerprint=obj["attestor"],
            timestamp=datetime.fromisoformat(obj["timestamp"]),
            metadata=obj.get("metadata", {}),
        )


@dataclass
class Attestation:
    """
    Working object that pairs raw image bytes with their hashes.

    Used while creating or verifying an attestation; the AttestationRecord
    (when present) is the part that gets stored/transmitted.
    """

    image_data: bytes
    image_hashes: ImageHashes
    record: AttestationRecord | None = None

    @property
    def is_attested(self) -> bool:
        """True once a record has been attached to this attestation."""
        if self.record is None:
            return False
        return True


@dataclass(frozen=True)
class VerificationResult:
    """
    Outcome of checking an image against attestation records.

    Carries whether a match was found, which hash algorithm matched, who
    attested it, and the matched record, e.g.
    "Yes, this image was attested by @photographer in Kyiv on 2024-03-15".
    """

    is_valid: bool
    match_type: str | None  # "exact", "phash", "dhash", etc.
    attestor: Identity | None
    record: AttestationRecord | None
    error: str | None = None

    # Match quality details
    hash_distances: dict[str, int] = field(default_factory=dict)

    @property
    def location(self) -> GeoLocation | None:
        """Capture location taken from the matched record, if there is one."""
        if not self.record:
            return None
        return self.record.location

    @property
    def captured_at(self) -> datetime | None:
        """Capture time taken from the matched record, if there is one."""
        if not self.record:
            return None
        return self.record.captured_at

    @property
    def attested_at(self) -> datetime | None:
        """Attestation timestamp of the matched record, if there is one."""
        if not self.record:
            return None
        return self.record.timestamp

    def summary(self) -> str:
        """One-line, human-readable description of the result.

        Failed results read "Not verified: <reason>". Successful ones
        join the available facts (attestor, date, location, non-exact
        match type) with spaces, or read "Verified" when none are known.
        """
        if not self.is_valid:
            reason = self.error or "No matching attestation found"
            return f"Not verified: {reason}"

        fragments: list[str] = []

        signer = self.attestor
        if signer:
            fragments.append(f"Attested by {signer.display_name}")

        when = self.attested_at
        if when:
            fragments.append(f"on {when.strftime('%Y-%m-%d')}")

        where = self.location
        if where:
            fragments.append(f"in {where}")

        kind = self.match_type
        if kind and kind != "exact":
            fragments.append(f"({kind} match)")

        return " ".join(fragments) or "Verified"


@dataclass(frozen=True)
class ProofLink:
    """
    A shareable link to an attestation proof.

    Photographers can share these to prove provenance:
    "Here's proof I took this photo: verisoo.io/v/a8f3c2d1e9b7"

    Attributes:
        record_id: Identifier of the attestation record being linked.
        base_url: Site root the links are built on. A trailing slash is
            tolerated and never produces a doubled "//" in the result.
    """

    record_id: str
    base_url: str = "https://verisoo.io"

    @property
    def short_id(self) -> str:
        """First 12 characters of the record id, used in the short URL."""
        return self.record_id[:12]

    @property
    def url(self) -> str:
        """Short share URL: ``<base>/v/<short_id>``."""
        # Strip trailing slashes so "https://host/" and "https://host"
        # yield the same link.
        return f"{self.base_url.rstrip('/')}/v/{self.short_id}"

    @property
    def full_url(self) -> str:
        """Full verification URL: ``<base>/verify/<record_id>``."""
        return f"{self.base_url.rstrip('/')}/verify/{self.record_id}"

    def __str__(self) -> str:
        """Return the short share URL."""
        return self.url


def _hamming_distance(hash1: str, hash2: str) -> int:
    """Return the number of differing bits between two hex-encoded hashes.

    Both strings are left-padded with zeros to a common width, parsed as
    base-16 integers and XOR-ed; the set bits of the result are counted.

    Raises:
        ValueError: if either string is not valid hexadecimal (an empty
            string included).
    """
    # Pad to the longer of the two; a no-op when the lengths already agree.
    width = max(len(hash1), len(hash2))
    left = int(hash1.zfill(width), 16)
    right = int(hash2.zfill(width), 16)

    # Each "1" in the binary rendering of the XOR marks one differing bit.
    differing = left ^ right
    return f"{differing:b}".count("1")