Add extract-then-strip EXIF pipeline for attestation intake
Some checks failed
CI / lint (push) Failing after 53s
CI / typecheck (push) Failing after 30s

Resolves the tension between steganography (strip everything to
protect sources) and attestation (preserve evidence of provenance):

- New soosef.metadata module with extract_and_classify() and
  extract_strip_pipeline() — classifies EXIF fields as evidentiary
  (GPS, timestamp — valuable for proving provenance) vs dangerous
  (device serial, firmware — could identify the source)
- Drop box now uses extract-then-strip: it attests the ORIGINAL bytes
  (so the hash matches what the source submitted), extracts evidentiary
  EXIF into the attestation metadata, strips dangerous fields, and
  stores the clean copy
- Attest route gains a strip_device option (defaults to on; takes
  effect only when auto_exif is also on): when enabled, includes
  GPS/timestamp in the attestation but excludes device serial/firmware
- Stego encode unchanged: still strips all metadata from carriers
  (correct for steganography threat model)

The key insight: for stego, the carrier is a vessel (strip everything).
For attestation, EXIF is the evidence (extract, classify, preserve
selectively). Both hashes (original + stripped) are recorded so the
relationship between raw submission and stored copy is provable.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Aaron D. Lee
2026-04-01 20:57:36 -04:00
parent 9431033c72
commit 171e51643c
3 changed files with 202 additions and 22 deletions

View File

@@ -141,6 +141,24 @@ def attest():
metadata["investigation"] = investigation
auto_exif = request.form.get("auto_exif", "on") == "on"
strip_device = request.form.get("strip_device", "on") == "on"
# Extract-then-classify: get evidentiary metadata before attestation
# so user can control what's included
if auto_exif and strip_device:
from soosef.metadata import extract_and_classify
extraction = extract_and_classify(image_data)
# Merge evidentiary fields (GPS, timestamp) but exclude
# dangerous device fields (serial, firmware version)
for key, value in extraction.evidentiary.items():
if key not in metadata: # User metadata takes precedence
if hasattr(value, "isoformat"):
metadata[f"exif_{key}"] = value.isoformat()
elif isinstance(value, dict):
metadata[f"exif_{key}"] = value
else:
metadata[f"exif_{key}"] = str(value)
# Create the attestation
from soosef.verisoo.attestation import create_attestation
@@ -149,7 +167,7 @@ def attest():
image_data=image_data,
private_key=private_key,
metadata=metadata if metadata else None,
auto_exif=auto_exif,
auto_exif=auto_exif and not strip_device, # Full EXIF only if not stripping device
)
# Store in the append-only log

View File

@@ -120,42 +120,55 @@ def upload(token):
if token_data["used"] >= token_data["max_files"]:
break
file_data = f.read()
if not file_data:
raw_data = f.read()
if not raw_data:
continue
# Strip EXIF metadata
try:
import io
# Extract-then-strip pipeline:
# 1. Extract EXIF into attestation metadata (evidentiary fields)
# 2. Attest the ORIGINAL bytes (hash matches what source submitted)
# 3. Strip metadata from the stored copy (protect source device info)
from soosef.metadata import extract_strip_pipeline
from PIL import Image
extraction, stripped_data = extract_strip_pipeline(raw_data)
img = Image.open(io.BytesIO(file_data))
clean = io.BytesIO()
img.save(clean, format=img.format or "PNG")
file_data = clean.getvalue()
except Exception:
pass # Not an image, or Pillow can't handle it — keep as-is
# SHA-256 of what the source actually submitted
sha256 = extraction.original_sha256
# Compute SHA-256
sha256 = hashlib.sha256(file_data).hexdigest()
# Save file
# Save the stripped copy for display/storage (no device fingerprint on disk)
dest = _TOKEN_DIR / f"{sha256[:16]}_{f.filename}"
dest.write_bytes(file_data)
dest.write_bytes(stripped_data)
# Auto-attest
chain_index = None
# Auto-attest the ORIGINAL bytes so the attestation hash matches
# what the source submitted. Evidentiary EXIF (GPS, timestamp)
# is preserved in the attestation metadata; dangerous fields
# (device serial) are excluded.
try:
from soosef.verisoo.attestation import create_attestation
from soosef.verisoo.storage import LocalStorage
from blueprints.attest import _get_private_key, _get_storage
attest_metadata = {
"source": "dropbox",
"label": token_data["label"],
"stripped_sha256": extraction.stripped_sha256,
}
# Include evidentiary EXIF in attestation (GPS, timestamp)
for key, value in extraction.evidentiary.items():
if hasattr(value, "isoformat"):
attest_metadata[key] = value.isoformat()
elif hasattr(value, "__dataclass_fields__"):
from dataclasses import asdict
attest_metadata[key] = asdict(value)
elif isinstance(value, dict):
attest_metadata[key] = value
else:
attest_metadata[key] = str(value)
private_key = _get_private_key()
if private_key:
attestation = create_attestation(
file_data, private_key, metadata={"source": "dropbox", "label": token_data["label"]}
raw_data, private_key, metadata=attest_metadata
)
storage = _get_storage()
storage.append_record(attestation.record)