Merge stegasoo (v4.3.0, steganography) and verisoo (v0.1.0, attestation) as subpackages under soosef.stegasoo and soosef.verisoo. This eliminates cross-repo coordination and enables atomic changes across the full stack. - Copy stegasoo (34 modules) and verisoo (15 modules) into src/soosef/ - Convert all verisoo absolute imports to relative imports - Rewire ~50 import sites across soosef code (cli, web, keystore, tests) - Replace stegasoo/verisoo pip deps with inlined code + pip extras (stego-dct, stego-audio, attest, web, api, cli, fieldkit, all, dev) - Add _availability.py for runtime feature detection - Add unified FastAPI mount point at soosef.api - Copy and adapt tests from both repos (155 pass, 1 skip) - Drop standalone CLI/web frontends; keep FastAPI as optional modules - Both source repos tagged pre-monorepo-consolidation on GitHub Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
282 lines
8.9 KiB
Python
282 lines
8.9 KiB
Python
"""
|
|
Steganalysis Self-Check Module (v4.4.0)
|
|
|
|
Statistical analysis to estimate detectability risk of stego images.
|
|
Runs chi-square and RS (Regular-Singular) analysis on pixel data
|
|
to assess how visible the embedding is to an attacker.
|
|
|
|
Currently LSB-only. DCT steganalysis (calibration attack) deferred.
|
|
|
|
Usage::
|
|
|
|
    from soosef.stegasoo.steganalysis import check_image
|
|
|
|
result = check_image(image_data)
|
|
print(result["risk"]) # "low", "medium", or "high"
|
|
print(result["chi_square"]) # per-channel chi-square p-values
|
|
print(result["rs"]) # per-channel RS embedding estimates
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
from dataclasses import dataclass, field
|
|
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
from .constants import (
|
|
STEGANALYSIS_CHI_SUSPICIOUS_THRESHOLD,
|
|
STEGANALYSIS_RS_HIGH_THRESHOLD,
|
|
STEGANALYSIS_RS_MEDIUM_THRESHOLD,
|
|
)
|
|
|
|
|
|
@dataclass
class SteganalysisResult:
    """Outcome of statistical steganalysis for a single image.

    Attributes:
        risk: Overall detectability verdict — "low", "medium", or "high".
        chi_square: Chi-square p-value keyed by channel name.
        rs: RS embedding-rate estimate keyed by channel name.
        width: Image width in pixels.
        height: Image height in pixels.
        channels: Number of channels analyzed (alpha is excluded).
        mode: Analysis mode used (currently "lsb").
    """

    risk: str
    chi_square: dict = field(default_factory=dict)
    rs: dict = field(default_factory=dict)
    width: int = 0
    height: int = 0
    channels: int = 0
    mode: str = "lsb"
|
|
|
|
|
|
def chi_square_analysis(channel_data: np.ndarray) -> float:
    """Chi-square test on LSB distribution of a single channel.

    Groups pixel values into pairs (2i, 2i+1) — so-called "pairs of values"
    (PoVs). In a clean image, each pair has a natural frequency ratio.
    LSB embedding with random data forces each pair toward equal frequency.

    The test measures H0: "pairs are equalized" (consistent with embedding).

    Args:
        channel_data: Flattened 1-D array of pixel values (uint8).

    Returns:
        p-value from chi-square test.
        HIGH p-value (close to 1.0) → pairs are equalized → suspicious.
        LOW p-value (close to 0.0) → pairs are not equalized → less suspicious.
    """
    from scipy.stats import chi2

    # Count occurrences of each value 0-255
    histogram = np.bincount(channel_data.ravel(), minlength=256)

    # Group into 128 pairs: (0,1), (2,3), ..., (254,255). Computed
    # vectorized: compare each pair's counts against an equalized split.
    even = histogram[0::2].astype(np.float64)
    odd = histogram[1::2].astype(np.float64)
    totals = even + odd
    populated = totals > 0  # empty pairs carry no information

    degrees_of_freedom = int(np.count_nonzero(populated))
    if degrees_of_freedom == 0:
        return 1.0  # No data to analyze

    expected = totals[populated] / 2.0
    chi_sq = float(
        np.sum((even[populated] - expected) ** 2 / expected)
        + np.sum((odd[populated] - expected) ** 2 / expected)
    )

    # p-value: probability of observing a chi-square at least this large
    # under H0. A HIGH p-value (small chi-square) means the pair counts are
    # near-equal — consistent with LSB embedding — which matches the
    # docstring above and the high-threshold check in assess_risk().
    # (The previous trailing comment claimed the opposite polarity.)
    p_value = 1.0 - chi2.cdf(chi_sq, degrees_of_freedom)
    return float(p_value)
|
|
|
|
|
|
def rs_analysis(channel_data: np.ndarray, block_size: int = 8) -> float:
    """Regular-Singular groups analysis on a single channel.

    Divides the image channel into groups of `block_size` pixels and measures
    the "smoothness" (variation) of each group. Applying a flipping function
    F1 (flip LSB) and F-1 (flip LSB of value-1) produces Regular (smoother)
    and Singular (rougher) groups.

    In a clean image: R_m ≈ R_{-m} and S_m ≈ S_{-m}.
    LSB embedding causes R_m and S_{-m} to converge while S_m and R_{-m}
    diverge, allowing estimation of the embedding rate.

    Args:
        channel_data: Flattened 1-D array of pixel values (uint8).
        block_size: Number of pixels per group (default 8).

    Returns:
        Estimated embedding rate (0.0 = clean, 1.0 = fully embedded).
        Values > 0.5 strongly indicate LSB embedding.
    """
    data = channel_data.ravel().astype(np.int16)
    # Trim to a whole number of groups; bail out when too few for a
    # statistically meaningful count.
    n_blocks = len(data) // block_size
    if n_blocks < 10:
        return 0.0  # Not enough data

    blocks = data[: n_blocks * block_size].reshape(n_blocks, block_size)

    def variation(b: np.ndarray) -> np.ndarray:
        """Per-row smoothness: sum of |differences| between adjacent pixels."""
        return np.sum(np.abs(np.diff(b, axis=1)), axis=1, dtype=np.int64)

    # F1: flip LSB (0↔1, 2↔3, 4↔5, ...)
    flipped_pos = blocks ^ 1
    # F-1: flip LSB of (value - 1), i.e. -1↔0, 1↔2, 3↔4, ...
    # (int16 working dtype makes the transient -1 safe)
    flipped_neg = np.where(blocks % 2 == 0, blocks - 1, blocks + 1)

    v_orig = variation(blocks)
    v_f1 = variation(flipped_pos)
    v_fn1 = variation(flipped_neg)

    # Classify each group: Regular if flipping roughens it, Singular if
    # flipping smooths it; ties count as neither (same as the scalar loop).
    r_m = int(np.count_nonzero(v_f1 > v_orig))
    s_m = int(np.count_nonzero(v_f1 < v_orig))
    r_neg = int(np.count_nonzero(v_fn1 > v_orig))
    s_neg = int(np.count_nonzero(v_fn1 < v_orig))

    # Estimate embedding rate using the RS quadratic formula
    # d0 = R_m - S_m, d1 = R_{-m} - S_{-m}
    # The embedding rate p satisfies: d(p/2) = d0, d(1 - p/2) = d1
    # Simplified estimator: p ≈ (R_m - S_m) / (R_{-m} - S_{-m}) divergence
    d0 = r_m - s_m
    d1 = r_neg - s_neg
    # (a redundant n_blocks == 0 re-check lived here; it was unreachable
    # after the n_blocks < 10 guard above and has been removed)

    # Use the simplified dual-statistic estimator
    # In clean images: d0 ≈ d1 (both positive)
    # In embedded images: d0 → 0 while d1 stays positive
    if d1 == 0:
        # Can't estimate — likely very embedded or degenerate
        return 0.5 if d0 == 0 else 0.0

    # Ratio-based estimate: how much has d0 dropped relative to d1
    ratio = d0 / d1
    if ratio >= 1.0:
        return 0.0  # d0 ≥ d1 means no evidence of embedding
    if ratio <= 0.0:
        return 1.0  # d0 collapsed or inverted

    # Linear interpolation: ratio=1 → 0% embedded, ratio=0 → 100% embedded
    estimate = 1.0 - ratio
    return float(np.clip(estimate, 0.0, 1.0))
|
|
|
|
|
|
def assess_risk(chi_p_values: dict[str, float], rs_estimates: dict[str, float]) -> str:
    """Map analysis results to a risk level.

    RS analysis is the primary metric (reliable for both sequential and
    random-order embedding). Chi-square is supplementary — high p-values
    indicate equalized PoV pairs, which is suspicious for random LSB embedding.

    Args:
        chi_p_values: Per-channel chi-square p-values (high = suspicious).
        rs_estimates: Per-channel RS embedding rate estimates (high = suspicious).

    Returns:
        "low", "medium", or "high" detectability risk.
    """
    # Nothing analyzed at all → nothing to flag.
    if not chi_p_values and not rs_estimates:
        return "low"

    # Worst-case signal across channels; RS is the primary indicator.
    worst_rs = max(rs_estimates.values(), default=0.0)
    # Chi-square: a high p-value means pairs are equalized (suspicious).
    worst_chi_p = max(chi_p_values.values(), default=0.0)

    # Guard-clause ladder, highest severity first.
    if worst_rs > STEGANALYSIS_RS_HIGH_THRESHOLD:
        return "high"
    if worst_rs > STEGANALYSIS_RS_MEDIUM_THRESHOLD:
        return "medium"
    if worst_chi_p > STEGANALYSIS_CHI_SUSPICIOUS_THRESHOLD and worst_rs > 0.05:
        # Corroboration case: both detectors show at least a weak signal.
        return "medium"

    return "low"
|
|
|
|
|
|
def check_image(image_data: bytes, mode: str = "lsb") -> dict:
    """Run steganalysis on an image and return detectability assessment.

    Args:
        image_data: Raw image bytes (PNG, BMP, etc.).
        mode: Analysis mode — currently only "lsb" is supported.

    Returns:
        Dict with keys: risk, chi_square, rs, width, height, channels, mode.
    """
    # Validate mode up front ("auto" currently behaves the same as "lsb").
    if mode not in ("lsb", "auto"):
        raise ValueError(f"Unsupported steganalysis mode: {mode}. Use 'lsb' or 'auto'.")

    # Decode the image; normalize exotic modes (palette, CMYK, ...) to RGB.
    img = Image.open(io.BytesIO(image_data))
    if img.mode not in ("RGB", "RGBA", "L"):
        img = img.convert("RGB")

    width, height = img.size
    pixels = np.array(img)
    img.close()

    # Grayscale images get a dummy channel axis so indexing is uniform.
    if pixels.ndim == 2:
        channel_names = ["L"]
        pixels = pixels[:, :, np.newaxis]
    else:
        channel_names = ["R", "G", "B"]

    num_channels = min(pixels.shape[2], 3)  # Skip alpha

    # Run both detectors per channel.
    flat_channels = {
        channel_names[idx]: pixels[:, :, idx].ravel() for idx in range(num_channels)
    }
    chi_p_values = {name: chi_square_analysis(ch) for name, ch in flat_channels.items()}
    rs_estimates = {name: rs_analysis(ch) for name, ch in flat_channels.items()}

    verdict = SteganalysisResult(
        risk=assess_risk(chi_p_values, rs_estimates),
        chi_square=chi_p_values,
        rs=rs_estimates,
        width=width,
        height=height,
        channels=num_channels,
        mode=mode,
    )

    # Flatten the dataclass into the plain-dict public return shape.
    return {
        "risk": verdict.risk,
        "chi_square": verdict.chi_square,
        "rs": verdict.rs,
        "width": verdict.width,
        "height": verdict.height,
        "channels": verdict.channels,
        "mode": verdict.mode,
    }
|