#!/usr/bin/env python3
"""
Enhanced attendance estimation incorporating iOS rotation analysis
"""

import json
import os
import sys

import numpy as np
import pandas as pd

# Load the saved data
RESULTS_DIR = "./analysis_results"

# Load session data
session_file = os.path.join(RESULTS_DIR, "device_sessions.csv")
if not os.path.exists(session_file):
    print(f"Error: {session_file} not found. Run the main analysis first.")
    sys.exit(1)

df = pd.read_csv(session_file)
df['firstSeenTime'] = pd.to_datetime(df['firstSeenTime'])
df['lastSeenTime'] = pd.to_datetime(df['lastSeenTime'])

# Load iOS rotation analysis if available
ios_rotation_file = os.path.join(RESULTS_DIR, "ios_rotation_analysis_20250807.csv")
rotation_df = None
if os.path.exists(ios_rotation_file):
    rotation_df = pd.read_csv(ios_rotation_file)
    print(f"Loaded iOS rotation analysis: {len(rotation_df)} time windows")

# Classify addresses
def classify_address(address):
    """Classify a BLE MAC address from its most significant octet.

    Heuristic: the locally-administered bit (0x02) flags random addresses;
    the top two bits then distinguish the BLE random address sub-types
    (Core Spec Vol 6, Part B, Sec 1.3): 0b11 static, 0b01 resolvable,
    0b00 non-resolvable.
    """
    if not isinstance(address, str):
        return "Invalid"
    address = address.replace(":", "").replace("-", "")
    if len(address) != 12:
        return "Invalid"

    try:
        msb = int(address[:2], 16)
    except ValueError:
        return "Invalid"

    is_local = (msb & 0x02) != 0      # locally administered (U/L) bit
    is_multicast = (msb & 0x01) != 0  # group (I/G) bit

    if is_local and not is_multicast:
        type_bits = (msb >> 6) & 0x03  # two most significant bits
        if type_bits == 0b11:
            return "Random Static"
        elif type_bits == 0b01:
            return "Random Private Resolvable"
        elif type_bits == 0b00:
            return "Random Private Non-Resolvable"
        else:  # 0b10
            return "Reserved"
    return "Public"

df['address_type'] = df['address'].apply(classify_address)

print("\n=== iOS ROTATION ADJUSTED ATTENDANCE ESTIMATION ===")

# Calculate total dwell time per device
device_dwell = (
    df.groupby(['date', 'address', 'address_type'])['dwellTime']
      .sum()
      .reset_index()
      .rename(columns={'dwellTime': 'total_dwell_time'})
)

# 1. Baseline counts
print("\n1. BASELINE COUNTS (no adjustment):")
total_devices = len(device_dwell)  # counts (date, address) pairs
dwell_10min = len(device_dwell[device_dwell['total_dwell_time'] >= 10])
print(f"  Total unique MACs: {total_devices:,}")
print(f"  MACs with ≥10 min dwell: {dwell_10min:,}")

# 2. iOS rotation impact analysis
if rotation_df is not None:
    print("\n2. iOS ROTATION IMPACT:")
    avg_addresses = rotation_df['unique_addresses'].mean()
    avg_rotations = rotation_df['potential_rotations'].mean()
    total_potential_rotations = rotation_df['potential_rotations'].sum()
    
    print(f"  Average addresses per 15-min window: {avg_addresses:.1f}")
    print(f"  Average potential rotations per window: {avg_rotations:.1f}")
    print(f"  Total potential rotations detected: {total_potential_rotations:,}")
    
    # Estimate rotation factor
    # If we see N potential rotations, it suggests devices are appearing as multiple MACs
    rotation_factor = 1 + (total_potential_rotations / total_devices)
    print(f"  Estimated rotation factor: {rotation_factor:.2f}x")

# 3. Address type specific adjustments
print("\n3. ADDRESS TYPE SPECIFIC ADJUSTMENTS:")

# Known behaviors by type:
# - Random Static: Generally stable (factor: 1.0)
# - Public: Mostly stable, some rotation (factor: 1.2)
# - Random Private Resolvable: Heavy rotation, especially iOS (factor: 3-5)
# - Random Private Non-Resolvable: Very heavy rotation (factor: 5-10)

adjustment_factors = {
    'Random Static': 1.0,  # Stable
    'Public': 1.2,  # Some rotation
    'Random Private Resolvable': 4.0,  # iOS devices, heavy rotation
    'Random Private Non-Resolvable': 8.0,  # Extreme rotation
    'Reserved': 1.0,
    'Invalid': 1.0
}
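
# Worked example (illustrative numbers, not from this dataset): 400 Resolvable
# MACs at a 4.0x factor imply roughly 400 / 4.0 = 100 underlying iOS devices.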

# Apply adjustments
adjusted_counts = {}
for addr_type, factor in adjustment_factors.items():
    type_count = len(device_dwell[
        (device_dwell['address_type'] == addr_type) & 
        (device_dwell['total_dwell_time'] >= 10)
    ])
    adjusted = int(type_count / factor)
    adjusted_counts[addr_type] = {
        'raw': type_count,
        'adjusted': adjusted,
        'factor': factor
    }
    print(f"  {addr_type}:")
    print(f"    Raw count: {type_count:,}")
    print(f"    Adjustment factor: {factor}x")
    print(f"    Adjusted count: {adjusted:,}")

# 4. Calculate total adjusted attendance
print("\n4. ADJUSTED ATTENDANCE ESTIMATES:")

# Method A: Simple sum of adjusted counts
method_a_total = sum(counts['adjusted'] for counts in adjusted_counts.values())
print(f"\n  Method A (Type-specific adjustment):")
print(f"    Total adjusted attendance: {method_a_total:,}")

# Method B: Weighted by dwell time
# Longer dwell times suggest less rotation
print(f"\n  Method B (Dwell-time weighted):")
dwell_weighted_total = 0
for addr_type in device_dwell['address_type'].unique():
    type_data = device_dwell[
        (device_dwell['address_type'] == addr_type) & 
        (device_dwell['total_dwell_time'] >= 10)
    ]
    if len(type_data) > 0:
        avg_dwell = type_data['total_dwell_time'].mean()
        # Higher dwell time = less likely to be rotating
        dwell_factor = min(avg_dwell / 30, 1.0)  # Cap at 30 min
        rotation_reduction = 1 + (adjustment_factors.get(addr_type, 1.0) - 1) * (1 - dwell_factor)
        adjusted = int(len(type_data) / rotation_reduction)
        dwell_weighted_total += adjusted
        print(f"    {addr_type}: {adjusted:,} (avg dwell: {avg_dwell:.1f} min)")

print(f"    Total adjusted attendance: {dwell_weighted_total:,}")

# Method C: Cluster-informed adjustment
# Use the clustering results to validate: each cluster is presumed to be one
# physical device, and unclustered MACs count individually.
cluster_file = os.path.join(RESULTS_DIR, "device_clusters.csv")
cluster_estimate = None
if os.path.exists(cluster_file):
    cluster_df = pd.read_csv(cluster_file)
    num_clusters = cluster_df['cluster_id'].nunique()
    clustered_devices = len(cluster_df)
    unclustered = total_devices - clustered_devices
    cluster_estimate = num_clusters + unclustered

    print(f"\n  Method C (Cluster-based validation):")
    print(f"    Number of clusters: {num_clusters:,}")
    print(f"    Clustered devices: {clustered_devices:,}")
    print(f"    Unclustered devices: {unclustered:,}")
    print(f"    Cluster-based estimate: {cluster_estimate:,}")

# 5. Final recommendations
print("\n5. ATTENDANCE ESTIMATE RECOMMENDATIONS:")
print(f"\n  Conservative (Random Static only): 858")
print(f"  Type-adjusted estimate: {method_a_total:,}")
print(f"  Dwell-weighted estimate: {dwell_weighted_total:,}")
print(f"  Best estimate range: {min(method_a_total, dwell_weighted_total):,} - {max(method_a_total, dwell_weighted_total):,}")

# 6. Confidence analysis
print("\n6. CONFIDENCE ANALYSIS:")
if rotation_df is not None:
    # Check consistency across time windows via the coefficient of
    # variation (std / mean) of per-window unique address counts
    window_cv = rotation_df['unique_addresses'].std() / rotation_df['unique_addresses'].mean()
    print(f"  Address count variation across windows (CV): {window_cv:.2%}")

    if window_cv < 0.2:
        print("  ✓ Low variation suggests a stable crowd")
    else:
        print("  ⚠ High variation suggests a dynamic crowd or detection issues")

# Compare methods using the coefficient of variation of the estimates
estimates = [conservative, method_a_total, dwell_weighted_total]
if cluster_estimate is not None:
    estimates.append(cluster_estimate)

estimate_cv = np.std(estimates) / np.mean(estimates)
print(f"\n  Spread between methods (CV): {estimate_cv:.2%}")
if estimate_cv < 0.3:
    print("  ✓ Good agreement between methods")
else:
    print("  ⚠ High spread between methods; consider manual review")