# fieldwitness/src/fieldwitness/attest/attestation.py

"""
Attestation Creation Module for Attest.

This module is the core of Attest's provenance system. An attestation is a
cryptographic proof that binds together:

    1. AN IMAGE     - identified by multiple hashes (SHA-256 + perceptual)
    2. AN IDENTITY  - the attestor's Ed25519 public key fingerprint
    3. A TIMESTAMP  - when the attestation was created
    4. METADATA     - optional context (location, device, caption, etc.)

The binding is secured by an Ed25519 digital signature over all components,
making it computationally infeasible to forge or modify after creation.

Architecture Overview:
----------------------

    ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
    │  Image Data │     │  Private    │     │  Metadata   │
    │  (bytes)    │     │  Key        │     │  (optional) │
    └──────┬──────┘     └──────┬──────┘     └──────┬──────┘
           │                   │                   │
           ▼                   │                   ▼
    ┌─────────────┐            │            ┌─────────────┐
    │  Compute    │            │            │  Extract    │
    │  Hashes     │            │            │  EXIF       │
    │  (SHA+pHash)│            │            │  (auto)     │
    └──────┬──────┘            │            └──────┬──────┘
           │                   │                   │
           └───────────┬───────┴───────────┬───────┘
                       │                   │
                       ▼                   ▼
                ┌─────────────────────────────────┐
                │  Build Canonical Signing Payload │
                │  (deterministic byte string)     │
                └───────────────┬─────────────────┘
                                │
                                ▼
                        ┌───────────────┐
                        │  Ed25519 Sign │
                        └───────┬───────┘
                                │
                                ▼
                        ┌───────────────┐
                        │  Attestation  │
                        │  Record       │
                        └───────────────┘

Security Properties:
-------------------
- **Authenticity**: Only the private key holder can create valid signatures
- **Integrity**: Any modification invalidates the signature
- **Non-repudiation**: Attestor cannot deny creating the attestation
- **Timestamping**: The signed timestamp records when the attestor claims the
  attestation was made (self-asserted by the signer, not independently witnessed)

Usage Example:
-------------
    from .attestation import create_attestation
    from .crypto import load_private_key

    # Load attestor's private key
    private_key = load_private_key("~/.attest/private.pem")

    # Create attestation with auto EXIF extraction
    attestation = create_attestation(
        image_data=open("photo.jpg", "rb").read(),
        private_key=private_key,
        metadata={"caption": "Street scene in Kyiv"},
    )

    # The attestation.record can now be stored in the append-only log
"""

from __future__ import annotations

from datetime import datetime, timezone
from typing import Any

from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey

from .crypto import create_identity, sign
from .hashing import hash_image
from .models import (
    Attestation,
    AttestationRecord,
    CaptureDevice,
    CaptureMetadata,
    GeoLocation,
    ImageHashes,
)


# =============================================================================
# EXIF METADATA EXTRACTION
# =============================================================================
#
# EXIF (Exchangeable Image File Format) is metadata embedded in images by
# cameras and phones. It contains valuable provenance information:
#   - When the photo was taken (DateTimeOriginal)
#   - Where it was taken (GPS coordinates)
#   - What device captured it (Make, Model)
#
# We extract this automatically to enrich attestations, but user-provided
# metadata always takes precedence (in case EXIF is wrong or missing).
# =============================================================================


def extract_exif_metadata(image_data: bytes) -> dict[str, Any]:
    """
    Extract provenance-relevant EXIF metadata from raw image bytes.

    Extraction is strictly best-effort: every failure mode (missing
    ``exifread`` dependency, unparsable file, malformed individual tag)
    results in the affected field being omitted, never in an exception.

    Extracted Fields:
    ----------------
    1. **captured_at**: first parsable value among DateTimeOriginal,
       DateTimeDigitized and DateTime (in that order). The EXIF string is
       a wall-clock time without zone information; it is labelled as UTC
       without any conversion, so treat it as approximate.

    2. **location**: GPS latitude/longitude converted from
       degrees/minutes/seconds to decimal degrees. South and West
       hemisphere references make the value negative.

    3. **device**: Make, Model and Software tags (whitespace-stripped).

    4. **width / height**: pixel dimensions as recorded in the tags, which
       may differ from the real dimensions if the image was resized.

    Args:
        image_data: Raw image bytes.

    Returns:
        Dictionary that may contain ``captured_at`` (timezone-aware
        datetime), ``location`` (GeoLocation), ``device`` (CaptureDevice),
        ``width`` (int) and ``height`` (int).

        An empty dict is returned when ``exifread`` is not installed, when
        parsing raises, or when no tags are found.

    Example:
        >>> exif = extract_exif_metadata(jpeg_bytes)
        >>> if exif.get("location"):
        ...     print(f"Photo taken at {exif['location']}")
    """
    # exifread is an optional dependency, so it is imported lazily; without
    # it attestations simply carry no EXIF-derived metadata.
    try:
        import io

        import exifread
    except ImportError:
        return {}

    # details=False skips thumbnail and MakerNote processing.
    # The broad except is deliberate: EXIF enrichment must never prevent an
    # attestation from being created.
    try:
        tags = exifread.process_file(io.BytesIO(image_data), details=False)
    except Exception:
        return {}

    if not tags:
        return {}

    result: dict[str, Any] = {}

    # -------------------------------------------------------------------------
    # TIMESTAMP
    # EXIF dates look like "YYYY:MM:DD HH:MM:SS" (colons in the date part).
    # The first field that parses wins; malformed fields are skipped.
    # -------------------------------------------------------------------------
    for tag_name in ("EXIF DateTimeOriginal", "EXIF DateTimeDigitized", "Image DateTime"):
        if tag_name in tags:
            try:
                dt = datetime.strptime(str(tags[tag_name]), "%Y:%m:%d %H:%M:%S")
            except (ValueError, TypeError):
                continue
            # No zone information is read here, so the value is labelled UTC.
            result["captured_at"] = dt.replace(tzinfo=timezone.utc)
            break

    # -------------------------------------------------------------------------
    # GPS COORDINATES
    # Stored as rationals in degrees/minutes/seconds, with separate reference
    # tags giving the hemisphere.
    # -------------------------------------------------------------------------
    gps_lat = tags.get("GPS GPSLatitude")
    gps_lat_ref = tags.get("GPS GPSLatitudeRef")  # "N" or "S"
    gps_lon = tags.get("GPS GPSLongitude")
    gps_lon_ref = tags.get("GPS GPSLongitudeRef")  # "E" or "W"

    if gps_lat and gps_lon:
        try:
            lat = _parse_gps_coord(gps_lat.values)
            lon = _parse_gps_coord(gps_lon.values)

            # South and West are negative in decimal-degree notation.
            if gps_lat_ref and str(gps_lat_ref) == "S":
                lat = -lat
            if gps_lon_ref and str(gps_lon_ref) == "W":
                lon = -lon

            result["location"] = GeoLocation(latitude=lat, longitude=lon)
        except (
            ValueError,
            TypeError,
            AttributeError,
            IndexError,          # fewer than three DMS components
            ZeroDivisionError,   # rational with a zero denominator
        ):
            # Malformed GPS data: drop the location instead of failing the
            # whole attestation.
            pass

    # -------------------------------------------------------------------------
    # DEVICE INFORMATION
    # -------------------------------------------------------------------------
    make = tags.get("Image Make")
    model = tags.get("Image Model")
    software = tags.get("Image Software")

    if make or model or software:
        result["device"] = CaptureDevice(
            make=str(make).strip() if make else None,
            model=str(model).strip() if model else None,
            software=str(software).strip() if software else None,
        )

    # -------------------------------------------------------------------------
    # IMAGE DIMENSIONS
    # EXIF-specific tags are preferred, generic image tags are the fallback.
    # -------------------------------------------------------------------------
    width = tags.get("EXIF ExifImageWidth") or tags.get("Image ImageWidth")
    height = tags.get("EXIF ExifImageLength") or tags.get("Image ImageLength")

    if width:
        try:
            result["width"] = int(str(width))
        except (ValueError, TypeError):
            pass
    if height:
        try:
            result["height"] = int(str(height))
        except (ValueError, TypeError):
            pass

    return result


def _parse_gps_coord(coord_values: list) -> float:
    """
    Turn an EXIF degrees/minutes/seconds triple into decimal degrees.

    Only the first three entries of ``coord_values`` are read. Each entry
    must expose ``num`` (numerator) and ``den`` (denominator) attributes;
    the component value is ``num / den``.

    Formula: decimal = degrees + (minutes / 60) + (seconds / 3600)

    Args:
        coord_values: Sequence whose first three items are the degrees,
                     minutes and seconds rationals, in that order.

    Returns:
        Decimal degrees as float. No hemisphere sign is applied here; the
        caller is responsible for negating South/West values.

    Raises:
        IndexError: If fewer than three components are supplied.
        AttributeError: If a component lacks ``num`` or ``den``.
        ZeroDivisionError: If a component has a zero denominator.

    Example:
        50 deg, 27 min, 0.36 sec given as 50/1, 27/1 and 36/100
        evaluates to 50 + 27/60 + 0.36/3600 = 50.4501
    """
    # Evaluate the three rationals in order: degrees, minutes, seconds.
    degrees, minutes, seconds = (
        float(coord_values[i].num) / float(coord_values[i].den) for i in range(3)
    )

    return degrees + minutes / 60.0 + seconds / 3600.0


# =============================================================================
# ATTESTATION CREATION
# =============================================================================
#
# The main entry point for creating attestations. This function orchestrates:
#   1. Metadata preparation (EXIF extraction + user overrides)
#   2. Image hashing (cryptographic + perceptual)
#   3. Payload construction (canonical, deterministic format)
#   4. Digital signing (Ed25519)
#   5. Record assembly
#
# The result is an Attestation object containing both the image data and the
# signed record, ready for storage in the append-only log.
# =============================================================================


def create_attestation(
    image_data: bytes,
    private_key: Ed25519PrivateKey,
    metadata: dict[str, Any] | CaptureMetadata | None = None,
    timestamp: datetime | None = None,
    auto_exif: bool = True,
) -> Attestation:
    """
    Hash an image, merge its metadata, and sign the result as an attestation.

    Pipeline:
    --------
    1. Choose the attestation time (``timestamp``, else the current UTC time).
    2. Flatten ``metadata`` into a plain dict (a caller-supplied dict is
       shallow-copied, a ``CaptureMetadata`` is converted via ``to_dict()``).
    3. When ``auto_exif`` is true, add EXIF-derived fields for every key the
       caller did not supply.
    4. Hash the image, derive the signer identity, build the canonical
       signing payload, sign it, and assemble the record.

    Metadata Precedence:
    -------------------
    A key that is present in the caller's metadata is never replaced by an
    EXIF value, whatever the caller's value is. EXIF only fills in the keys
    ``captured_at``, ``location``, ``device``, ``width`` and ``height`` when
    they are absent. EXIF values are stored in JSON-friendly form:
    ``captured_at`` as an ISO 8601 string, ``location`` and ``device`` via
    their ``to_dict()``.

    Args:
        image_data:
            Raw image bytes; passed to ``hash_image`` and, optionally, to
            EXIF extraction. Returned unchanged on the resulting object.

        private_key:
            Ed25519 private key used to sign. Its identity fingerprint is
            embedded in the record.

        metadata:
            Optional caller-supplied context: a dict, a ``CaptureMetadata``
            instance, or None for no user metadata.

        timestamp:
            Attestation time override. This is the time of attestation, not
            of capture; capture time belongs in the metadata.

        auto_exif:
            When True (default), enrich the metadata from the image's EXIF
            tags. Pass False to skip extraction, for example to keep
            location or device details out of the signed record.

    Returns:
        Attestation carrying the original ``image_data``, the computed
        ``image_hashes`` and the signed ``record``.

    Raises:
        Nothing is caught here: errors from hashing, identity creation,
        payload serialization and signing propagate to the caller.

    Example:
        >>> attestation = create_attestation(image_bytes, private_key)

        >>> attestation = create_attestation(
        ...     image_bytes,
        ...     private_key,
        ...     metadata={"caption": "Street scene"},
        ...     auto_exif=False,
        ... )
    """
    # -------------------------------------------------------------------------
    # Attestation time
    # -------------------------------------------------------------------------
    if not timestamp:
        timestamp = datetime.now(timezone.utc)

    # -------------------------------------------------------------------------
    # User metadata as a plain dict
    # -------------------------------------------------------------------------
    if metadata is None:
        user_fields: dict[str, Any] = {}
    elif isinstance(metadata, CaptureMetadata):
        user_fields = metadata.to_dict()
    else:
        # Shallow copy, so the merge below never touches the caller's dict.
        user_fields = dict(metadata)

    # -------------------------------------------------------------------------
    # EXIF enrichment: EXIF supplies defaults, user-supplied keys win
    # -------------------------------------------------------------------------
    exif_fields = extract_exif_metadata(image_data) if auto_exif else None
    if exif_fields:
        exif_defaults: dict[str, Any] = {}
        for key in ("captured_at", "location", "device", "width", "height"):
            if key not in exif_fields or key in user_fields:
                continue
            value = exif_fields[key]
            if key == "captured_at":
                exif_defaults[key] = value.isoformat()
            elif key in ("location", "device"):
                exif_defaults[key] = value.to_dict()
            else:
                exif_defaults[key] = value

        # EXIF-derived keys come first, user keys follow; no key overlaps.
        exif_defaults.update(user_fields)
        user_fields = exif_defaults

    # -------------------------------------------------------------------------
    # Hash, identify, sign
    # -------------------------------------------------------------------------
    image_hashes = hash_image(image_data)
    signer = create_identity(private_key)
    signature = sign(
        private_key,
        _build_signing_payload(image_hashes, signer.fingerprint, timestamp, user_fields),
    )

    # -------------------------------------------------------------------------
    # Assemble the signed record together with the image bytes
    # -------------------------------------------------------------------------
    return Attestation(
        image_data=image_data,
        image_hashes=image_hashes,
        record=AttestationRecord(
            image_hashes=image_hashes,
            signature=signature,
            attestor_fingerprint=signer.fingerprint,
            timestamp=timestamp,
            metadata=user_fields,
        ),
    )


def create_attestation_from_hashes(
    image_hashes: ImageHashes,
    private_key: Ed25519PrivateKey,
    metadata: dict[str, Any] | None = None,
    timestamp: datetime | None = None,
) -> AttestationRecord:
    """
    Create attestation record from pre-computed hashes (without image bytes).

    This is the lower-level counterpart of create_attestation() for callers
    that hold image hashes but not the image itself, for example when hashes
    are computed on one system and signed on another, or in tests.

    Unlike create_attestation(), this function:
    - Does NOT compute hashes (uses the provided ImageHashes)
    - Does NOT extract EXIF (no image bytes available)
    - Returns the AttestationRecord directly (no Attestation wrapper)

    Args:
        image_hashes: Pre-computed ImageHashes object with sha256, phash, dhash
        private_key: Ed25519 private key for signing
        metadata: Optional metadata dict. It is shallow-copied, so the
                  returned record does not share the top-level dict with
                  the caller (nested values are still shared).
        timestamp: Override timestamp (defaults to now, UTC)

    Returns:
        AttestationRecord whose signature covers the hashes, the signer's
        fingerprint, the timestamp and the metadata.

    Example:
        >>> hashes = hash_image(image_bytes)
        >>> # ... later, possibly on a different system ...
        >>> record = create_attestation_from_hashes(hashes, private_key)
    """
    timestamp = timestamp or datetime.now(timezone.utc)

    # Snapshot the caller's dict, as create_attestation() does. Storing the
    # caller's object by reference would let a later mutation change the
    # record's metadata after it was signed, so the signature would no longer
    # match the record.
    metadata = dict(metadata) if metadata else {}

    identity = create_identity(private_key)
    signing_payload = _build_signing_payload(
        image_hashes, identity.fingerprint, timestamp, metadata
    )
    signature = sign(private_key, signing_payload)

    return AttestationRecord(
        image_hashes=image_hashes,
        signature=signature,
        attestor_fingerprint=identity.fingerprint,
        timestamp=timestamp,
        metadata=metadata,
    )


# =============================================================================
# SIGNING PAYLOAD CONSTRUCTION
# =============================================================================
#
# The signing payload is the exact byte sequence that gets signed. It MUST be:
#   1. Deterministic - same inputs always produce same payload
#   2. Unambiguous - different inputs never produce same payload
#   3. Complete - covers all attested data
#
# We achieve this with:
#   - Newline-separated fields (unambiguous delimiter)
#   - Sorted JSON keys (deterministic object serialization)
#   - No whitespace in JSON (canonical form)
#   - UTF-8 encoding (explicit byte representation)
# =============================================================================


def _build_signing_payload(
    image_hashes: ImageHashes,
    attestor_fingerprint: str,
    timestamp: datetime,
    metadata: dict[str, Any],
) -> bytes:
    """
    Serialize all attested fields into the exact bytes that get signed.

    Identical inputs always yield identical bytes; signature verification
    relies on rebuilding this payload byte-for-byte, so the layout below
    must not change.

    Payload Layout (one field per line, no trailing newline):
    --------------------------------------------------------
    Line 1: image_hashes.sha256
    Line 2: image_hashes.phash
    Line 3: image_hashes.dhash
    Line 4: attestor_fingerprint
    Line 5: timestamp.isoformat()
    Line 6: metadata as compact JSON

    Example payload:
        a1b2c3d4...
        f8e7d6c5...
        1a2b3c4d...
        9f8e7d6c...
        2024-01-15T10:30:00+00:00
        {"caption":"Test","location":{"lat":50.45,"lon":30.52}}

    Canonicalization Rules:
    ----------------------
    - JSON object keys are sorted (sort_keys=True)
    - No whitespace between JSON tokens (separators=(",", ":"))
    - All other json.dumps defaults apply
    - The timestamp is rendered with isoformat(); a UTC offset appears only
      when the datetime is timezone-aware
    - The joined text is encoded as UTF-8

    Args:
        image_hashes: Object exposing sha256, phash and dhash strings
        attestor_fingerprint: String identifying the attestor
        timestamp: Datetime to embed (should be timezone-aware)
        metadata: JSON-serializable dict

    Returns:
        UTF-8 encoded payload bytes

    Raises:
        TypeError: If metadata contains a value json.dumps cannot serialize,
            or if a hash/fingerprint field is not a string.
    """
    import json

    # Serialize the metadata first so a non-serializable value fails before
    # anything else is read.
    canonical_metadata = json.dumps(metadata, separators=(",", ":"), sort_keys=True)

    # Field order is part of the signed format.
    fields = (
        image_hashes.sha256,
        image_hashes.phash,
        image_hashes.dhash,
        attestor_fingerprint,
        timestamp.isoformat(),
        canonical_metadata,
    )

    return "\n".join(fields).encode("utf-8")