Add extract-then-strip EXIF pipeline for attestation intake
Resolves the tension between steganography (strip everything to protect sources) and attestation (preserve evidence of provenance):

- New soosef.metadata module with extract_and_classify() and extract_strip_pipeline() — classifies EXIF fields as evidentiary (GPS, timestamp — valuable for proving provenance) vs. dangerous (device serial, firmware — could identify the source).
- Drop box now uses extract-then-strip: attests the ORIGINAL bytes (hash matches what the source submitted), extracts evidentiary EXIF into the attestation metadata, strips dangerous fields, and stores a clean copy.
- Attest route gains a strip_device option: when enabled, includes GPS/timestamp in the attestation but excludes device serial/firmware.
- Stego encode is unchanged: it still strips all metadata from carriers (correct for the steganography threat model).

The key insight: for stego, the carrier is a vessel (strip everything). For attestation, EXIF is the evidence (extract, classify, preserve selectively). Both hashes (original + stripped) are recorded so the relationship between the raw submission and the stored copy is provable.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -141,6 +141,24 @@ def attest():
|
||||
metadata["investigation"] = investigation
|
||||
|
||||
auto_exif = request.form.get("auto_exif", "on") == "on"
|
||||
strip_device = request.form.get("strip_device", "on") == "on"
|
||||
|
||||
# Extract-then-classify: get evidentiary metadata before attestation
|
||||
# so user can control what's included
|
||||
if auto_exif and strip_device:
|
||||
from soosef.metadata import extract_and_classify
|
||||
|
||||
extraction = extract_and_classify(image_data)
|
||||
# Merge evidentiary fields (GPS, timestamp) but exclude
|
||||
# dangerous device fields (serial, firmware version)
|
||||
for key, value in extraction.evidentiary.items():
|
||||
if key not in metadata: # User metadata takes precedence
|
||||
if hasattr(value, "isoformat"):
|
||||
metadata[f"exif_{key}"] = value.isoformat()
|
||||
elif isinstance(value, dict):
|
||||
metadata[f"exif_{key}"] = value
|
||||
else:
|
||||
metadata[f"exif_{key}"] = str(value)
|
||||
|
||||
# Create the attestation
|
||||
from soosef.verisoo.attestation import create_attestation
|
||||
@@ -149,7 +167,7 @@ def attest():
|
||||
image_data=image_data,
|
||||
private_key=private_key,
|
||||
metadata=metadata if metadata else None,
|
||||
auto_exif=auto_exif,
|
||||
auto_exif=auto_exif and not strip_device, # Full EXIF only if not stripping device
|
||||
)
|
||||
|
||||
# Store in the append-only log
|
||||
|
||||
@@ -120,42 +120,55 @@ def upload(token):
|
||||
if token_data["used"] >= token_data["max_files"]:
|
||||
break
|
||||
|
||||
file_data = f.read()
|
||||
if not file_data:
|
||||
raw_data = f.read()
|
||||
if not raw_data:
|
||||
continue
|
||||
|
||||
# Strip EXIF metadata
|
||||
try:
|
||||
import io
|
||||
# Extract-then-strip pipeline:
|
||||
# 1. Extract EXIF into attestation metadata (evidentiary fields)
|
||||
# 2. Attest the ORIGINAL bytes (hash matches what source submitted)
|
||||
# 3. Strip metadata from the stored copy (protect source device info)
|
||||
from soosef.metadata import extract_strip_pipeline
|
||||
|
||||
from PIL import Image
|
||||
extraction, stripped_data = extract_strip_pipeline(raw_data)
|
||||
|
||||
img = Image.open(io.BytesIO(file_data))
|
||||
clean = io.BytesIO()
|
||||
img.save(clean, format=img.format or "PNG")
|
||||
file_data = clean.getvalue()
|
||||
except Exception:
|
||||
pass # Not an image, or Pillow can't handle it — keep as-is
|
||||
# SHA-256 of what the source actually submitted
|
||||
sha256 = extraction.original_sha256
|
||||
|
||||
# Compute SHA-256
|
||||
sha256 = hashlib.sha256(file_data).hexdigest()
|
||||
|
||||
# Save file
|
||||
# Save the stripped copy for display/storage (no device fingerprint on disk)
|
||||
dest = _TOKEN_DIR / f"{sha256[:16]}_{f.filename}"
|
||||
dest.write_bytes(file_data)
|
||||
dest.write_bytes(stripped_data)
|
||||
|
||||
# Auto-attest
|
||||
chain_index = None
|
||||
# Auto-attest the ORIGINAL bytes so the attestation hash matches
|
||||
# what the source submitted. Evidentiary EXIF (GPS, timestamp)
|
||||
# is preserved in the attestation metadata; dangerous fields
|
||||
# (device serial) are excluded.
|
||||
try:
|
||||
from soosef.verisoo.attestation import create_attestation
|
||||
from soosef.verisoo.storage import LocalStorage
|
||||
|
||||
from blueprints.attest import _get_private_key, _get_storage
|
||||
|
||||
attest_metadata = {
|
||||
"source": "dropbox",
|
||||
"label": token_data["label"],
|
||||
"stripped_sha256": extraction.stripped_sha256,
|
||||
}
|
||||
# Include evidentiary EXIF in attestation (GPS, timestamp)
|
||||
for key, value in extraction.evidentiary.items():
|
||||
if hasattr(value, "isoformat"):
|
||||
attest_metadata[key] = value.isoformat()
|
||||
elif hasattr(value, "__dataclass_fields__"):
|
||||
from dataclasses import asdict
|
||||
attest_metadata[key] = asdict(value)
|
||||
elif isinstance(value, dict):
|
||||
attest_metadata[key] = value
|
||||
else:
|
||||
attest_metadata[key] = str(value)
|
||||
|
||||
private_key = _get_private_key()
|
||||
if private_key:
|
||||
attestation = create_attestation(
|
||||
file_data, private_key, metadata={"source": "dropbox", "label": token_data["label"]}
|
||||
raw_data, private_key, metadata=attest_metadata
|
||||
)
|
||||
storage = _get_storage()
|
||||
storage.append_record(attestation.record)
|
||||
|
||||
Reference in New Issue
Block a user