#!/usr/bin/env python3
"""
Scanner System Diagnostic
Helps identify what's wrong with the BLE scanning setup
"""

import json
import os
from collections import Counter, defaultdict
from datetime import datetime

# Directory searched for the Picklecon_combineddwell_flat_<date>.json exports
# that check_scanner_issues() analyzes.
DATA_DIR = "/var/www/html/CrowdFlow/Picklecon"

# Characters allowed in a normalized (upper-cased, colon-free) MAC address.
_HEX_DIGITS = frozenset("0123456789ABCDEF")

# (exclusive upper bound, label) pairs for the dwell-time histogram.  The
# labels imply that dwellTime is expressed in minutes.
_DWELL_LIMITS = (
    (1, '0-1 min'),
    (5, '1-5 min'),
    (15, '5-15 min'),
    (30, '15-30 min'),
    (60, '30-60 min'),
    (120, '1-2 hours'),
    (240, '2-4 hours'),
    (480, '4-8 hours'),
)
_DWELL_OVERFLOW_LABEL = '8+ hours'


def _is_number(value):
    """Return True when *value* is an int or float (i.e. safely comparable)."""
    return isinstance(value, (int, float))


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _load_json(path):
    """Parse and return the JSON document stored at *path* (read as UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def check_scanner_issues(data_dir=None):
    """Diagnose scanner configuration issues and print a report to stdout.

    Loads the 20250809 export as the sample day, then prints six sections:
    address byte diversity, OUI distribution, dwell-time histogram, RSSI
    histogram, location coverage and a summary of detected issues, followed
    by a count of long-dwell devices across the four event days.

    Records with missing, null or non-numeric fields are skipped in the
    section that needs that field instead of aborting the whole report, and
    every percentage falls back to 0 when its denominator is empty.

    Args:
        data_dir: Directory containing the
            ``Picklecon_combineddwell_flat_<date>.json`` files.  Defaults to
            the module-level ``DATA_DIR`` when omitted or ``None``.

    Returns:
        None.  All results are printed; if the sample file is missing an
        error line is printed and the function returns early.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    print("=== SCANNER SYSTEM DIAGNOSTIC ===\n")

    # Load sample data
    sample_file = os.path.join(data_dir, "Picklecon_combineddwell_flat_20250809.json")
    if not os.path.exists(sample_file):
        print("ERROR: Cannot find sample file")
        return

    data = _load_json(sample_file)
    total_records = len(data)

    print(f"Analyzing {total_records:,} records from Saturday (peak day)\n")

    # 1. Check for hash-like address patterns
    print("1. ADDRESS PATTERN ANALYSIS")
    print("-" * 40)

    # Normalize to 12 upper-case hex digits; drop anything that is missing,
    # not a string, the wrong length, or not valid hex.
    addresses = []
    for r in data:
        raw = r.get('address')
        if not isinstance(raw, str):
            continue
        addr = raw.replace(':', '').upper()
        if len(addr) == 12 and _HEX_DIGITS.issuperset(addr):
            addresses.append(addr)

    # Check byte distribution on at most the first 10,000 addresses.
    # NOTE: "entropy" here is the fraction of the 256 possible byte values
    # that were observed at each position, not Shannon entropy.
    sample = addresses[:10000]
    byte_entropy = []
    for byte_pos in range(6):
        start = byte_pos * 2
        # Addresses are normalized, so distinct hex pairs == distinct bytes.
        unique_values = len({addr[start:start + 2] for addr in sample})
        entropy = unique_values / 256
        byte_entropy.append(entropy)
        print(f"  Byte {byte_pos}: {unique_values}/256 unique values ({entropy:.1%} entropy)")

    avg_entropy = sum(byte_entropy) / len(byte_entropy)
    print(f"\n  Average entropy: {avg_entropy:.1%}")
    if avg_entropy > 0.9:
        print("  ⚠️  HIGH ENTROPY: Addresses appear hashed/randomized!")

    # 2. Check OUI distribution
    print("\n\n2. OUI (MANUFACTURER) ANALYSIS")
    print("-" * 40)

    oui_counts = Counter(addr[:6] for addr in addresses)
    # Ratio of distinct OUIs to addresses; 0 when there are no addresses.
    oui_diversity = len(oui_counts) / len(addresses) if addresses else 0.0

    print(f"  Total addresses: {len(addresses):,}")
    print(f"  Unique OUIs: {len(oui_counts):,}")
    print(f"  OUI diversity: {oui_diversity * 100:.1f}%")

    # Normal should be <10% diversity; only flag when it exceeds 30%.
    if oui_diversity > 0.3:
        print("  ⚠️  ABNORMAL: Real devices cluster around common manufacturers")

    # Show top OUIs (should see Apple, Samsung, etc.)
    print("\n  Top 10 OUIs:")
    for oui, count in oui_counts.most_common(10):
        pct = _percent(count, len(addresses))
        print(f"    {oui}: {count:,} ({pct:.1f}%)")

    # 3. Check dwell time distribution
    print("\n\n3. DWELL TIME DISTRIBUTION")
    print("-" * 40)

    dwell_times = []
    for r in data:
        dwell = r.get('dwellTime')
        if _is_number(dwell) and dwell > 0:
            dwell_times.append(dwell)

    dwell_buckets = {label: 0 for _, label in _DWELL_LIMITS}
    dwell_buckets[_DWELL_OVERFLOW_LABEL] = 0

    for dwell in dwell_times:
        for upper, label in _DWELL_LIMITS:
            if dwell < upper:
                dwell_buckets[label] += 1
                break
        else:
            # Not below any limit: 8 hours or longer.
            dwell_buckets[_DWELL_OVERFLOW_LABEL] += 1

    print(f"  Total devices with dwell time: {len(dwell_times):,}")
    for bucket, count in dwell_buckets.items():
        pct = _percent(count, len(dwell_times))
        bar = '█' * int(pct / 2)
        print(f"  {bucket:>10}: {count:>6,} ({pct:>5.1f}%) {bar}")

    short_dwell = dwell_buckets['0-1 min'] + dwell_buckets['1-5 min']
    short_pct = _percent(short_dwell, len(dwell_times))

    if short_pct > 50:
        print(f"\n  ⚠️  {short_pct:.0f}% have <5 min dwell = drive-by traffic!")

    # 4. RSSI distribution
    print("\n\n4. SIGNAL STRENGTH (RSSI) DISTRIBUTION")
    print("-" * 40)

    # Defined up front so the diagnosis section can read it even when no
    # record carries a usable RSSI value.
    weak_pct = 0.0

    rssi_values = []
    for r in data:
        # 'rssi' is only consulted when the 'avgRssi' key is absent.
        rssi = r.get('avgRssi', r.get('rssi'))
        if _is_number(rssi) and -100 < rssi < 0:
            rssi_values.append(rssi)

    if rssi_values:
        # Floor each reading to the lower edge of its 5 dBm bucket; int()
        # keeps the labels integral when readings are floats.
        rssi_buckets = Counter(int(rssi // 5) * 5 for rssi in rssi_values)

        print(f"  Total RSSI readings: {len(rssi_values):,}")
        print("  Distribution:")

        for rssi in sorted(rssi_buckets, reverse=True):
            count = rssi_buckets[rssi]
            pct = _percent(count, len(rssi_values))
            bar = '█' * int(pct)

            # Interpret distance (based on the bucket's lower edge)
            if rssi > -60:
                distance = "Very close (<1m)"
            elif rssi > -70:
                distance = "Close (1-3m)"
            elif rssi > -80:
                distance = "Medium (3-10m)"
            elif rssi > -90:
                distance = "Far (10-30m)"
            else:
                distance = "Very far (>30m)"

            print(f"  {rssi:>4} to {rssi+4:>4} dBm: {count:>6,} ({pct:>5.1f}%) {bar} {distance}")

        weak_signal = sum(count for rssi, count in rssi_buckets.items() if rssi < -85)
        weak_pct = _percent(weak_signal, len(rssi_values))

        if weak_pct > 30:
            print(f"\n  ⚠️  {weak_pct:.0f}% have weak signal = capturing street traffic!")
    else:
        print("  No usable RSSI readings found")

    # 5. Location coverage
    print("\n\n5. LOCATION COVERAGE")
    print("-" * 40)

    location_counts = Counter()
    multi_location = 0

    for r in data:
        # Fall back to the single 'Location' field when the visited list is
        # missing or empty.
        locations = r.get('locationsVisited') or [r.get('Location', 'Unknown')]
        location_counts.update(locations)

        if len(locations) > 1:
            multi_location += 1

    print("  Devices by location:")
    for loc, count in location_counts.most_common():
        pct = _percent(count, total_records)
        print(f"    {loc}: {count:,} ({pct:.1f}%)")

    multi_pct = _percent(multi_location, total_records)
    print(f"\n  Multi-location visitors: {multi_location:,} ({multi_pct:.1f}%)")

    # 6. Final diagnosis
    print("\n\n6. DIAGNOSIS")
    print("=" * 60)

    issues = []

    if avg_entropy > 0.9:
        issues.append("Address randomization/hashing detected")

    if oui_diversity > 0.3:
        issues.append("Abnormal OUI diversity (addresses being transformed)")

    if short_pct > 50:
        issues.append(f"{short_pct:.0f}% are drive-by traffic (need dwell filter)")

    if weak_pct > 30:
        issues.append(f"{weak_pct:.0f}% are too far away (need RSSI filter)")

    if total_records > 10000:
        issues.append("Capturing way too many devices for venue size")

    print("\nDETECTED ISSUES:")
    if not issues:
        print("  (none)")
    for i, issue in enumerate(issues, 1):
        print(f"  {i}. {issue}")

    print("\n\nRECOMMENDATIONS:")
    print("  1. Your scanner is definitely transforming addresses")
    print("  2. Apply strict filters: >15 min dwell, >-85 dBm RSSI")
    print("  3. Focus only on Random Static addresses")
    print("  4. Even then, expect inflated numbers due to address transformation")
    print("  5. Contact scanner vendor - this system is not suitable for visitor analytics")

    # Check specific scanner behavior
    print("\n\nSCANNER TYPE DETECTION:")

    # Collect addresses that dwelt for more than 12 hours (720 minutes) on
    # any of the four event days.  Records without an address are skipped
    # so they are not counted as a device.
    long_dwell_addresses = set()

    for date in ("20250807", "20250808", "20250809", "20250810"):
        day_file = os.path.join(data_dir, f"Picklecon_combineddwell_flat_{date}.json")
        if day_file == sample_file:
            day_data = data  # already parsed above; avoid re-reading it
        elif os.path.exists(day_file):
            day_data = _load_json(day_file)
        else:
            continue

        for r in day_data:
            dwell = r.get('dwellTime')
            address = r.get('address')
            if _is_number(dwell) and dwell > 720 and address is not None:
                long_dwell_addresses.add(address)

    four_day_unique = len(long_dwell_addresses)

    print(f"\n  Infrastructure devices (>12hr dwell): {four_day_unique}")

    if four_day_unique < 50 and total_records > 50000:
        print("\n  🔴 VERDICT: Privacy-preserving scanner with daily key rotation")
        print("     - Addresses are hashed with: hash(real_MAC + daily_salt)")
        print("     - Only infrastructure MACs are whitelisted")
        print("     - True visitor tracking is IMPOSSIBLE with this system")

# Run the diagnostic only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_scanner_issues()