#!/usr/bin/env python3
"""
Advanced BLE Analysis with Corrected Address Classification
Address typing follows the Bluetooth Core Specification v5.4; fingerprinting,
clustering, and cross-day linking rely on heuristics documented inline.
"""

import json
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import defaultdict, Counter
import warnings
import time
from tqdm import tqdm
warnings.filterwarnings('ignore')

# Configuration
DATA_DIR = "/var/www/html/CrowdFlow/Picklecon"  # Update this path
FILE_PREFIX = "Picklecon"  # Update this to match your directory name prefix
EVENT_DATES = ["20250807"]  # Update these dates
RESULTS_DIR = "./analysis_results"
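# Hourly input files are expected to be named
#   <DATA_DIR>/<FILE_PREFIX>_combined_flat_<YYYYMMDD>_<HH>.json
# e.g. Picklecon_combined_flat_20250807_10.json (see load_and_process_data).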

# Optional: specify hour range if event doesn't run 24 hours
START_HOUR = 10   # First hour to check (0-23)
END_HOUR = 16    # Last hour to check (0-23)

# Performance options
IOS_ANALYSIS_SAMPLE_RATE = 1.0  # Use 0.1 to sample 10% of time windows for faster iOS analysis
MAX_ADDRESSES_PER_WINDOW = 1000  # Limit addresses per window to prevent excessive comparisons

# Analysis parameters based on research
RSSI_BIN_WIDTH = 5  # dBm bins for RSSI histograms
MIN_RSSI_SAMPLES = 10  # Minimum RSSI readings for reliable fingerprint
IOS_ROTATION_WINDOW = 15  # minutes - iOS MAC rotation cycle
DBSCAN_EPS = 0.3  # DBSCAN epsilon for clustering
MIN_CLUSTER_SIZE = 2  # Minimum devices to form a cluster
CONFIDENCE_THRESHOLD = 0.75  # 75% confidence for cross-day linking
DWELL_TIME_THRESHOLD = 15  # minutes - gap before considering new visit
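# DWELL_TIME_THRESHOLD doubles as the gap that splits raw records into separate
# sessions in aggregate_raw_records; IOS_ROTATION_WINDOW sets the time-bin width
# used by analyze_ios_rotation_windows_optimized.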

# Known BLE manufacturer OUIs
KNOWN_MANUFACTURER_OUIS = {
    "0018DE": "Nordic Semiconductor",
    "AC233F": "Shenzhen Minew",
    "000B57": "Silicon Laboratories", 
    "0002F7": "Cypress Semiconductor",
    "00A050": "Cypress",
    "D4F513": "Texas Instruments",
    "24E124": "Apple",
    "38F9D3": "Apple",
    "44E66E": "Apple",
    "6C96CF": "Apple",
    "78D75F": "Apple",
    "88E87F": "Apple",
    "9CF48E": "Apple",
    "AC233F": "Minew",
    "5A6D5A": "Estimote",
    "0025DF": "Kontakt.io",
    "00158D": "Xiaomi",
    "A4C138": "Telink",
    "AC83F3": "Espressif",
    "30AEA4": "Espressif",
}

class BLEFingerprinter:
    def __init__(self):
        self.devices = {}
        self.fingerprints = {}
        self.clusters = {}
        self.cross_day_links = defaultdict(list)
        self.file_loading_log = []  # Track which files were loaded
        self.infrastructure_devices = set()  # Track infrastructure
        self.classic_bt_devices = set()  # Track Classic Bluetooth devices
        
        # Create results directory
        os.makedirs(RESULTS_DIR, exist_ok=True)
    
    def normalize_address(self, address):
        """Normalize a MAC address to 12 uppercase hex chars, e.g. 'ac:23:3f:12:34:56' -> 'AC233F123456'."""
        if isinstance(address, (int, float)):
            address = str(int(address))
        else:
            address = str(address)
        
        address = address.replace(':', '').upper()
        if not address:
            return ''  # keep missing/empty addresses empty so callers can skip them
        if len(address) < 12:
            address = address.zfill(12)
        return address[:12] if len(address) > 12 else address
    
    def classify_address(self, address):
        """Classify MAC address type based on the BLE Core Specification v5.4.

        Without the advertising PDU's TxAdd flag the public/random split cannot
        be recovered with certainty, so this combines the two most significant
        address bits with OUI heuristics.
        """
        if len(address) != 12:
            return "Invalid"
        
        try:
            # A known manufacturer OUI is treated as strong evidence of a
            # public address, whatever the top two bits happen to be.
            if self.is_known_manufacturer_oui(address[:6]):
                return "Public"
            
            # Get the most significant byte
            msb = int(address[:2], 16)
            
            # Check the two most significant bits [47:46]
            # e.g. first byte 0x7B = 0b01111011 -> type_bits 0b01 (resolvable)
            type_bits = (msb >> 6) & 0x03
            
            # Random address sub-types defined by the two most significant bits
            if type_bits == 0b11:    # 11xxxxxx
                return "Random Static"
            elif type_bits == 0b01:  # 01xxxxxx
                return "Random Private Resolvable"
            elif type_bits == 0b00:  # 00xxxxxx
                return "Random Private Non-Resolvable"
            else:  # 10xxxxxx
                # The 10 pattern is not a defined random sub-type, so a public
                # address is the most likely interpretation; fall back to
                # checking whether the OUI portion looks plausible.
                if self.is_valid_oui_structure(address):
                    return "Public (Unregistered OUI)"
                return "Ambiguous"
        except ValueError:
            return "Invalid"
    
    def is_known_manufacturer_oui(self, oui):
        """Check if OUI belongs to known manufacturers"""
        return oui.upper() in KNOWN_MANUFACTURER_OUIS
    
    def is_valid_oui_structure(self, address):
        """Check if address has valid OUI structure for public address"""
        # Public addresses typically don't have all zeros or all ones in OUI
        oui = address[:6]
        if oui == "000000" or oui == "FFFFFF":
            return False
        
        # Check for other patterns that indicate non-public addresses
        # Many random addresses have patterns like XX:XX:XX where bytes repeat
        if address[0:2] == address[2:4] == address[4:6]:
            return False
        
        return True
    
    def is_likely_infrastructure(self, device_fingerprint):
        """Detect likely infrastructure devices"""
        indicators = 0
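        # Simple voting scheme: each heuristic below contributes one point;
        # three or more points flags the address as likely infrastructure.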
        
        # Check for very consistent RSSI (low std deviation)
        if device_fingerprint.get('rssi_std', float('inf')) < 3:
            indicators += 1
        
        # Check for extended presence (2+ hours)
        if device_fingerprint.get('total_dwell_time', 0) > 120:
            indicators += 1
        
        # Check for regular beacon-like intervals
        avg_interval = device_fingerprint.get('avg_interval', 0)
        if 0.9 < avg_interval < 1.1:  # ~1 second intervals
            indicators += 1
        
        # Check for low interval variance (very regular)
        if device_fingerprint.get('interval_std', float('inf')) < 0.1:
            indicators += 1
        
        # Check for single location (stationary)
        if device_fingerprint.get('location_diversity', float('inf')) == 1:
            indicators += 1
        
        # Check for low burst ratio (regular, not bursty)
        if device_fingerprint.get('burst_ratio', 1) < 0.1:
            indicators += 1
        
        return indicators >= 3  # Need at least 3 indicators
    
    def is_likely_classic_bluetooth(self, address, device_fingerprint):
        """Detect likely Classic Bluetooth devices based on behavioral patterns"""
        indicators = 0
        
        # Classic BT indicators:
        
        # 1. Very stable observed RSSI
        if device_fingerprint.get('rssi_std', float('inf')) < 5:
            indicators += 1
        
        # 2. Intervals near characteristic Classic BT scan periods (multiples of 0.64 s / 1.28 s)
        avg_interval = device_fingerprint.get('avg_interval', 0)
        classic_intervals = [1.28, 2.56, 0.64, 5.12]
        for interval in classic_intervals:
            if abs(avg_interval - interval) < 0.1:
                indicators += 2  # Strong indicator
                break
        
        # 3. Extended continuous presence without gaps
        if device_fingerprint.get('total_dwell_time', 0) > 30:
            indicators += 1
        
        # 4. No rotation behavior (should not cluster with others)
        if device_fingerprint.get('session_count', 0) == 1:
            indicators += 1
        
        # 5. Check if it's NOT in a cluster (Classic doesn't rotate).
        # Note: in load_and_process_data this runs before the current date is
        # clustered, so only clusters from previously processed dates apply here.
        addr_in_cluster = False
        for date_clusters in self.clusters.values():
            for cluster_addresses in date_clusters.values():
                if address in cluster_addresses and len(cluster_addresses) > 1:
                    addr_in_cluster = True
                    break
            if addr_in_cluster:
                break
        if not addr_in_cluster:
            indicators += 1
        
        # 6. Known Classic BT OUI patterns (audio, computer, automotive)
        classic_ouis = {
            # Audio manufacturers
            "000A95": "Bose",
            "04E676": "Sony", 
            "006037": "JBL",
            "00166C": "Beats",
            "B8F6B1": "Apple (AirPods)",
            "AC9E17": "Apple (AirPods)",
            # Computer manufacturers
            "001E37": "Dell",
            "00215C": "HP",
            "001560": "Lenovo",
            "B86B23": "Dell",
            # Automotive
            "002128": "BMW",
            "9CFDF0": "Tesla",
            "04F8C2": "Mercedes-Benz",
            # Peripherals
            "0004B0": "Logitech",
            "00126F": "Microsoft",
        }
        
        oui = address[:6]
        if oui in classic_ouis:
            indicators += 2  # Strong indicator
        
        # 7. Very consistent presence (no 15-min rotation pattern)
        if device_fingerprint.get('burst_ratio', 1) < 0.05:
            indicators += 1
        
        return indicators >= 3  # Need at least 3 indicators
    
    def aggregate_raw_records(self, raw_records):
        """Aggregate raw records into device sessions"""
        # Sort by timestamp
        raw_records.sort(key=lambda x: x['timestamp'])
        
        sessions = []
        current_session = None
        
        for record in raw_records:
            if current_session is None:
                # Start new session
                current_session = {
                    'firstSeenTime': record['timestamp'],
                    'lastSeenTime': record['timestamp'],
                    'rssi_values': [record['rssi']],
                    'locations': [record['Location']],
                    'timestamps': [record['timestamp']],
                    'state_changes': [record.get('state', 'unknown')]
                }
            else:
                # Check if this is part of the same session
                last_time = pd.to_datetime(current_session['lastSeenTime'])
                current_time = pd.to_datetime(record['timestamp'])
                time_gap = (current_time - last_time).total_seconds() / 60  # minutes
                
                if time_gap <= DWELL_TIME_THRESHOLD:
                    # Same session
                    current_session['lastSeenTime'] = record['timestamp']
                    current_session['rssi_values'].append(record['rssi'])
                    current_session['locations'].append(record['Location'])
                    current_session['timestamps'].append(record['timestamp'])
                    current_session['state_changes'].append(record.get('state', 'unknown'))
                else:
                    # New session
                    sessions.append(current_session)
                    current_session = {
                        'firstSeenTime': record['timestamp'],
                        'lastSeenTime': record['timestamp'],
                        'rssi_values': [record['rssi']],
                        'locations': [record['Location']],
                        'timestamps': [record['timestamp']],
                        'state_changes': [record.get('state', 'unknown')]
                    }
        
        # Don't forget the last session
        if current_session:
            sessions.append(current_session)
        
        # Calculate derived metrics for each session
        for session in sessions:
            first_time = pd.to_datetime(session['firstSeenTime'])
            last_time = pd.to_datetime(session['lastSeenTime'])
            session['dwellTime'] = (last_time - first_time).total_seconds() / 60  # minutes
            session['recordCount'] = len(session['rssi_values'])
            session['avgRssi'] = np.mean(session['rssi_values'])
            session['locationsVisited'] = list(set(session['locations']))
        
        return sessions
    
    def extract_features(self, device_data):
        """Extract multi-modal features for fingerprinting"""
        features = {}
        
        # RSSI fingerprint - create histogram
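        # Note: bin edges span -100 to -45 dBm in RSSI_BIN_WIDTH steps, so
        # readings outside that range are dropped from the histogram while the
        # normalization below still divides by the full sample count.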
        if 'rssi_values' in device_data and len(device_data['rssi_values']) >= MIN_RSSI_SAMPLES:
            rssi_hist, _ = np.histogram(device_data['rssi_values'], 
                                       bins=range(-100, -40, RSSI_BIN_WIDTH))
            features['rssi_histogram'] = rssi_hist / len(device_data['rssi_values'])
            features['rssi_mean'] = np.mean(device_data['rssi_values'])
            features['rssi_std'] = np.std(device_data['rssi_values'])
            features['rssi_skew'] = stats.skew(device_data['rssi_values'])
        
        # Timing patterns
        if 'timestamps' in device_data and len(device_data['timestamps']) > 1:
            times = sorted(device_data['timestamps'])
            intervals = np.diff(times)
            if len(intervals) > 0:
                features['avg_interval'] = np.mean(intervals)
                features['interval_std'] = np.std(intervals)
                features['burst_ratio'] = np.sum(intervals < 1) / len(intervals)
        
        # Location patterns
        if 'locations' in device_data:
            loc_counts = Counter(device_data['locations'])
            total_visits = sum(loc_counts.values())
            features['location_entropy'] = stats.entropy(list(loc_counts.values()))
            features['primary_location_ratio'] = max(loc_counts.values()) / total_visits
            features['location_diversity'] = len(set(device_data['locations']))
        
        # Behavioral features
        features['total_dwell_time'] = device_data.get('total_dwell_time', 0)
        features['session_count'] = device_data.get('session_count', 0)
        features['total_records'] = device_data.get('total_records', 0)
        features['first_seen_hour'] = device_data.get('first_seen_hour', 0)
        
        return features
    
    def compute_fingerprint_similarity(self, fp1, fp2):
        """Compute similarity between two fingerprints"""
        similarity_scores = []
        
        # RSSI histogram similarity (if both have it)
        if 'rssi_histogram' in fp1 and 'rssi_histogram' in fp2:
            # Cosine similarity; guard against zero-norm histograms
            denom = np.linalg.norm(fp1['rssi_histogram']) * np.linalg.norm(fp2['rssi_histogram'])
            if denom > 0:
                hist_sim = np.dot(fp1['rssi_histogram'], fp2['rssi_histogram']) / denom
                # Weight RSSI heavily (note: this can push the averaged score above 1.0)
                similarity_scores.append(hist_sim * 2)
        
        # Behavioral similarity
        behavioral_features = ['total_dwell_time', 'location_entropy', 'primary_location_ratio']
        for feat in behavioral_features:
            if feat in fp1 and feat in fp2:
                # Normalized difference
                max_val = max(abs(fp1[feat]), abs(fp2[feat]), 1)
                sim = 1 - abs(fp1[feat] - fp2[feat]) / max_val
                similarity_scores.append(sim)
        
        # Timing similarity
        if 'avg_interval' in fp1 and 'avg_interval' in fp2:
            interval_sim = 1 - abs(fp1['avg_interval'] - fp2['avg_interval']) / max(
                fp1['avg_interval'], fp2['avg_interval'], 1
            )
            similarity_scores.append(interval_sim)
        
        return np.mean(similarity_scores) if similarity_scores else 0
    
    def cluster_devices_dbscan(self, date_devices):
        """Use DBSCAN to cluster devices based on fingerprints"""
        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler
        
        # Extract feature matrix
        addresses = list(date_devices.keys())
        feature_matrix = []
        
        for addr in addresses:
            fp = self.fingerprints.get(addr, {})
            # Create feature vector
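            # Missing fingerprint fields fall back to rough defaults so every
            # device yields the same 7-dimensional vector.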
            features = [
                fp.get('rssi_mean', -80),
                fp.get('rssi_std', 10),
                fp.get('total_dwell_time', 0),
                fp.get('location_entropy', 0),
                fp.get('primary_location_ratio', 1),
                fp.get('avg_interval', 60),
                fp.get('first_seen_hour', 12)
            ]
            feature_matrix.append(features)
        
        if len(feature_matrix) < MIN_CLUSTER_SIZE:
            return {}
        
        # Standardize features
        scaler = StandardScaler()
        X = scaler.fit_transform(feature_matrix)
        
        # Apply DBSCAN
        clustering = DBSCAN(eps=DBSCAN_EPS, min_samples=MIN_CLUSTER_SIZE)
        labels = clustering.fit_predict(X)
        
        # Group by cluster
        clusters = defaultdict(list)
        for i, label in enumerate(labels):
            if label != -1:  # Not noise
                clusters[label].append(addresses[i])
        
        return dict(clusters)
    
    def analyze_ios_rotation_windows_optimized(self, df):
        """Optimized iOS rotation analysis with progress tracking"""
        # Create 15-minute bins
        df['time_bin'] = pd.to_datetime(df['timestamp']).dt.floor(f'{IOS_ROTATION_WINDOW}min')
        
        rotation_analysis = []
        time_bins = df['time_bin'].unique()
        
        # Sample time windows if requested
        if IOS_ANALYSIS_SAMPLE_RATE < 1.0:
            sample_size = max(1, int(len(time_bins) * IOS_ANALYSIS_SAMPLE_RATE))
            time_bins = np.random.choice(time_bins, size=sample_size, replace=False)
            print(f"  Sampling {len(time_bins)} of {df['time_bin'].nunique()} time windows")
        
        # Progress bar for time windows
        for time_bin in tqdm(time_bins, desc="  Analyzing iOS rotation windows"):
            group = df[df['time_bin'] == time_bin]
            addresses = group['address'].unique()
            
            # Limit addresses per window if too many
            if len(addresses) > MAX_ADDRESSES_PER_WINDOW:
                addresses = np.random.choice(addresses, size=MAX_ADDRESSES_PER_WINDOW, replace=False)
            
            # Look for similar RSSI patterns in this window
            address_set = set(addresses)  # O(1) membership tests
            rssi_patterns = defaultdict(list)
            for _, row in group.iterrows():
                addr = row['address']
                if addr in address_set:  # Only process selected addresses
                    rssi_patterns[addr].append(row['rssi'])
            
            # Find potential rotations (similar RSSI in same window)
            potential_rotations = []
            addrs = list(rssi_patterns.keys())
            
            # Pairwise mean-RSSI comparison (O(n^2) per window; MAX_ADDRESSES_PER_WINDOW caps n)
            if len(addrs) > 1:
                # Calculate mean RSSI for each address
                mean_rssis = {addr: np.mean(rssi_patterns[addr]) for addr in addrs}
                
                # Only compare if we have enough samples
                valid_addrs = [addr for addr in addrs if len(rssi_patterns[addr]) >= 3]
                
                for i in range(len(valid_addrs)):
                    for j in range(i+1, len(valid_addrs)):
                        rssi_diff = abs(mean_rssis[valid_addrs[i]] - mean_rssis[valid_addrs[j]])
                        if rssi_diff < 5:  # Within 5 dBm
                            potential_rotations.append((valid_addrs[i], valid_addrs[j], rssi_diff))
            
            rotation_analysis.append({
                'time_bin': time_bin,
                'unique_addresses': len(addresses),
                'potential_rotations': len(potential_rotations)
            })
        
        return pd.DataFrame(rotation_analysis)
    
    def analyze_ble_patterns(self):
        """Analyze BLE-specific patterns in the data"""
        print("\n--- BLE Pattern Analysis ---")
        
        # Get first date's data for analysis
        if not EVENT_DATES or EVENT_DATES[0] not in self.devices:
            print("  No data available for BLE pattern analysis")
            return
        
        first_date_devices = self.devices[EVENT_DATES[0]]
        
        # Analyze advertisement intervals
        interval_patterns = defaultdict(int)
        for device in first_date_devices.values():
            fp = self.fingerprints.get(device['address'], {})
            if 'avg_interval' in fp:
                interval = fp['avg_interval']
                # Round to nearest 100ms
                interval_bucket = round(interval * 10) / 10
                interval_patterns[interval_bucket] += 1
        
        print("\n  Top advertisement interval patterns:")
        for interval, count in sorted(interval_patterns.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"    {interval:.1f}s: {count} devices")
        
        # Detect iBeacon/Eddystone patterns
        beacon_candidates = 0
        for addr, device in first_date_devices.items():
            fp = self.fingerprints.get(addr, {})
            # Beacons typically have very regular intervals and consistent RSSI
            if (fp.get('interval_std', float('inf')) < 0.1 and 
                fp.get('rssi_std', float('inf')) < 5 and
                fp.get('burst_ratio', 0) < 0.1):
                beacon_candidates += 1
        
        print(f"\n  Likely beacon devices: {beacon_candidates}")
        
        # Analyze address type patterns by hour
        hourly_types = defaultdict(lambda: defaultdict(int))
        for session in self.df.itertuples():
            hour = session.firstSeenTime.hour
            hourly_types[hour][session.address_type] += 1
        
        print("\n  Address type distribution by hour:")
        for hour in sorted(hourly_types.keys()):
            print(f"    Hour {hour:02d}:")
            for addr_type, count in sorted(hourly_types[hour].items()):
                print(f"      {addr_type}: {count}")
    
    def probabilistic_cross_day_linking(self, day1_data, day2_data):
        """Link devices across days using probabilistic matching"""
        links = []
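        # Greedy one-directional matching: each day-1 address keeps its single
        # best day-2 match above CONFIDENCE_THRESHOLD; a day-2 address may end
        # up linked to more than one day-1 address.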
        
        for addr1, fp1 in day1_data.items():
            best_match = None
            best_score = 0
            
            for addr2, fp2 in day2_data.items():
                # Skip if same address (unlikely with randomization)
                if addr1 == addr2:
                    continue
                
                # Compute similarity
                similarity = self.compute_fingerprint_similarity(fp1, fp2)
                
                if similarity > best_score and similarity >= CONFIDENCE_THRESHOLD:
                    best_score = similarity
                    best_match = addr2
            
            if best_match:
                links.append({
                    'day1_address': addr1,
                    'day2_address': best_match,
                    'confidence': best_score,
                    'day1_type': self.classify_address(addr1),
                    'day2_type': self.classify_address(best_match)
                })
        
        return links
    
    def load_and_process_data(self):
        """Load data and create fingerprints"""
        all_data = []
        
        for date in EVENT_DATES:
            print(f"\nProcessing date: {date}")
            
            # Collect all data for this date across all hours
            date_raw_data = []
            hours_found = []
            
            # Try to load hourly files
            for hour in range(START_HOUR, END_HOUR + 1):
                hour_str = f"{hour:02d}"  # Format as 00, 01, ..., 23
                
                # Build filename for this hour
                # Using the configured prefix
                filename = os.path.join(DATA_DIR, f"{FILE_PREFIX}_combined_flat_{date}_{hour_str}.json")
                
                if os.path.exists(filename):
                    hours_found.append(hour_str)
                    try:
                        with open(filename, 'r') as f:
                            hour_data = json.load(f)
                            date_raw_data.extend(hour_data)
                            print(f"  ✓ Loaded hour {hour_str}: {len(hour_data)} records")
                            
                            # Track loading info
                            self.file_loading_log.append({
                                'date': date,
                                'hour': hour_str,
                                'filename': filename,
                                'exists': True,
                                'records': len(hour_data),
                                'status': 'success'
                            })
                    except Exception as e:
                        print(f"  ✗ Error loading hour {hour_str}: {e}")
                        self.file_loading_log.append({
                            'date': date,
                            'hour': hour_str,
                            'filename': filename,
                            'exists': True,
                            'records': 0,
                            'status': f'error: {str(e)}'
                        })
                else:
                    self.file_loading_log.append({
                        'date': date,
                        'hour': hour_str,
                        'filename': filename,
                        'exists': False,
                        'records': 0,
                        'status': 'missing'
                    })
                
            if not date_raw_data:
                print(f"  ⚠️  No data found for {date}")
                continue
            
            print(f"  Total records for {date}: {len(date_raw_data)} from hours {', '.join(hours_found)}")
            
            # Show hourly distribution
            hour_distribution = defaultdict(int)
            for record in date_raw_data:
                try:
                    ts = pd.to_datetime(record.get('timestamp'))
                    hour_distribution[ts.hour] += 1
                except:
                    pass
            
            if hour_distribution:
                print(f"  Hourly distribution: {dict(sorted(hour_distribution.items()))}")
            
            # Group raw records by device address
            print(f"  Grouping records by device address...")
            raw_by_device = defaultdict(list)
            
            for record in tqdm(date_raw_data, desc="  Processing records"):
                address = self.normalize_address(record.get('address', ''))
                if not address:
                    continue
                
                # Ensure timestamp is properly formatted
                if 'timestamp' in record:
                    try:
                        # Parse and reformat timestamp if needed
                        ts = pd.to_datetime(record['timestamp'])
                        record['timestamp'] = ts
                    except:
                        continue
                
                raw_by_device[address].append(record)
            
            # Process each device's raw records
            print(f"  Aggregating {len(raw_by_device)} devices into sessions...")
            date_devices = {}
            
            for address, raw_records in tqdm(raw_by_device.items(), desc="  Creating sessions"):
                # Aggregate raw records into sessions
                sessions = self.aggregate_raw_records(raw_records)
                
                if not sessions:
                    continue
                
                # Create device data structure
                date_devices[address] = {
                    'sessions': sessions,
                    'rssi_values': [],
                    'timestamps': [],
                    'locations': [],
                    'address': address,
                    'date': date,
                    'total_dwell_time': sum(s['dwellTime'] for s in sessions),
                    'session_count': len(sessions),
                    'total_records': sum(s['recordCount'] for s in sessions)
                }
                
                # Aggregate all values across sessions
                for session in sessions:
                    date_devices[address]['rssi_values'].extend(session['rssi_values'])
                    date_devices[address]['locations'].extend(session['locations'])
                    
                    # Convert timestamps to float for processing
                    for ts in session['timestamps']:
                        date_devices[address]['timestamps'].append(
                            pd.to_datetime(ts).timestamp()
                        )
                
                # Get first seen hour
                first_ts = pd.to_datetime(sessions[0]['firstSeenTime'])
                date_devices[address]['first_seen_hour'] = first_ts.hour
                
                # Add aggregated sessions to DataFrame format
                for session in sessions:
                    session_record = {
                        'date': date,
                        'address': address,
                        'address_normalized': address,
                        'address_type': self.classify_address(address),
                        'firstSeenTime': session['firstSeenTime'],
                        'lastSeenTime': session['lastSeenTime'],
                        'dwellTime': session['dwellTime'],
                        'recordCount': session['recordCount'],
                        'avgRssi': session['avgRssi'],
                        'locationsVisited': session['locationsVisited']
                    }
                    all_data.append(session_record)
            
            # Create fingerprints
            print(f"  Creating fingerprints for {len(date_devices)} devices...")
            for addr, device_data in tqdm(date_devices.items(), desc="  Fingerprinting"):
                self.fingerprints[addr] = self.extract_features(device_data)
            
            # Identify infrastructure devices
            print("  Identifying infrastructure devices...")
            for addr, fp in self.fingerprints.items():
                if self.is_likely_infrastructure(fp):
                    self.infrastructure_devices.add(addr)
            print(f"  Found {len(self.infrastructure_devices)} infrastructure devices")
            
            # Identify Classic Bluetooth devices
            print("  Identifying Classic Bluetooth devices...")
            for addr, fp in self.fingerprints.items():
                if addr not in self.infrastructure_devices:  # Don't double-count
                    if self.is_likely_classic_bluetooth(addr, fp):
                        self.classic_bt_devices.add(addr)
            print(f"  Found {len(self.classic_bt_devices)} Classic Bluetooth devices")
            
            # Cluster devices
            print("  Clustering devices...")
            clusters = self.cluster_devices_dbscan(date_devices)
            self.clusters[date] = clusters
            print(f"  Found {len(clusters)} clusters")
            
            # Store for cross-day analysis
            self.devices[date] = date_devices
        
        # Create DataFrame
        self.df = pd.DataFrame(all_data)
        if not self.df.empty:
            self.df['firstSeenTime'] = pd.to_datetime(self.df['firstSeenTime'])
            self.df['lastSeenTime'] = pd.to_datetime(self.df['lastSeenTime'])
            
            # Print loading summary
            print("\n=== DATA LOADING SUMMARY ===")
            print(f"Total dates processed: {len(self.devices)}")
            print(f"Total unique devices: {len(self.fingerprints)}")
            print(f"Total sessions: {len(self.df)}")
            print(f"Date range: {self.df['date'].min()} to {self.df['date'].max()}")
            print(f"Hour range checked: {START_HOUR:02d}:00 to {END_HOUR:02d}:59")
    
    def analyze_and_visualize(self):
        """Perform analysis and create visualizations"""
        print("\n=== ADVANCED BLE ANALYSIS RESULTS ===")
        
        # 1. Address type distribution
        plt.figure(figsize=(12, 6))
        
        plt.subplot(1, 2, 1)
        type_counts = self.df['address_type'].value_counts()
        type_counts.plot(kind='bar')
        plt.title('Address Type Distribution')
        plt.xticks(rotation=45)
        
        # 2. RSSI fingerprint visualization
        plt.subplot(1, 2, 2)
        rssi_data = []
        for fp in self.fingerprints.values():
            if 'rssi_mean' in fp:
                rssi_data.append([fp['rssi_mean'], fp.get('rssi_std', 0)])
        
        if rssi_data:
            rssi_df = pd.DataFrame(rssi_data, columns=['RSSI Mean', 'RSSI Std'])
            plt.scatter(rssi_df['RSSI Mean'], rssi_df['RSSI Std'], alpha=0.5)
            plt.xlabel('RSSI Mean (dBm)')
            plt.ylabel('RSSI Std Dev')
            plt.title('RSSI Fingerprint Distribution')
        
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'address_analysis.png'))
        plt.close()
        
        # 3. iOS rotation window analysis
        print("\n--- iOS Rotation Window Analysis ---")
        for date in EVENT_DATES:
            if date in self.devices:
                date_df = self.df[self.df['date'] == date]
                if not date_df.empty:
                    print(f"\n  Analyzing iOS rotations for {date}...")
                    start_time = time.time()
                    
                    # Need to create a raw format DataFrame for rotation analysis
                    raw_df_data = []
                    for addr, device in self.devices[date].items():
                        for session in device.get('sessions', []):
                            for i, ts in enumerate(session['timestamps']):
                                raw_df_data.append({
                                    'timestamp': ts,
                                    'address': addr,
                                    'rssi': session['rssi_values'][i]
                                })
                    
                    if raw_df_data:
                        raw_df = pd.DataFrame(raw_df_data)
                        print(f"  Total raw records: {len(raw_df)}")
                        
                        # Use optimized analysis
                        rotation_df = self.analyze_ios_rotation_windows_optimized(raw_df)
                        
                        avg_addresses = rotation_df['unique_addresses'].mean()
                        avg_rotations = rotation_df['potential_rotations'].mean()
                        
                        print(f"  {date}: Avg {avg_addresses:.1f} addresses per 15min, "
                              f"{avg_rotations:.1f} potential rotations")
                        print(f"  Analysis completed in {time.time() - start_time:.1f} seconds")
                        
                        # Save rotation analysis
                        rotation_df.to_csv(os.path.join(RESULTS_DIR, f'ios_rotation_analysis_{date}.csv'), index=False)
        
        # 4. Cross-day linking
        print("\n--- Cross-Day Probabilistic Linking ---")
        dates = sorted(self.devices.keys())
        
        cross_day_summary = []
        for i in range(len(dates)-1):
            date1, date2 = dates[i], dates[i+1]
            
            # Get fingerprints for each day
            day1_fps = {addr: self.fingerprints[addr] 
                       for addr in self.devices[date1].keys() 
                       if addr in self.fingerprints}
            day2_fps = {addr: self.fingerprints[addr] 
                       for addr in self.devices[date2].keys() 
                       if addr in self.fingerprints}
            
            # Perform linking
            links = self.probabilistic_cross_day_linking(day1_fps, day2_fps)
            
            if links:
                link_df = pd.DataFrame(links)
                high_conf_links = link_df[link_df['confidence'] >= 0.8]
                
                print(f"{date1} → {date2}: {len(links)} total links, "
                      f"{len(high_conf_links)} high confidence (≥80%)")
                
                cross_day_summary.append({
                    'day_pair': f"{date1}-{date2}",
                    'total_links': len(links),
                    'high_conf_links': len(high_conf_links),
                    'avg_confidence': link_df['confidence'].mean()
                })
        
        # 5. Clustering effectiveness
        print("\n--- Clustering Analysis ---")
        cluster_summary = []
        
        for date, clusters in self.clusters.items():
            if clusters:
                cluster_sizes = [len(addrs) for addrs in clusters.values()]
                cluster_summary.append({
                    'date': date,
                    'num_clusters': len(clusters),
                    'avg_cluster_size': np.mean(cluster_sizes),
                    'max_cluster_size': max(cluster_sizes),
                    'clustered_devices': sum(cluster_sizes)
                })
        
        if cluster_summary:
            cluster_df = pd.DataFrame(cluster_summary)
            print(cluster_df)
        
        # 6. Enhanced Attendance estimation
        print("\n--- Enhanced Attendance Estimation ---")
        
        # Method 1: Raw unique count (overestimate)
        daily_unique = self.df.groupby('date')['address_normalized'].nunique()
        print(f"\n1. RAW UNIQUE DEVICES (all addresses, all dwell times):")
        for date, count in daily_unique.items():
            print(f"  {date}: {count:,}")
        
        # Filter out infrastructure devices
        non_infrastructure_df = self.df[~self.df['address_normalized'].isin(self.infrastructure_devices)]
        visitor_devices = non_infrastructure_df['address_normalized'].unique()
        
        print(f"\n2. DEVICE CATEGORIZATION:")
        print(f"  Infrastructure devices: {len(self.infrastructure_devices)}")
        print(f"  Classic Bluetooth devices: {len(self.classic_bt_devices)}")
        print(f"  BLE devices: {len(visitor_devices) - len(self.classic_bt_devices)}")
        print(f"  Total visitor devices: {len(visitor_devices)}")
        
        # Separate Classic BT from BLE
        classic_bt_df = non_infrastructure_df[non_infrastructure_df['address_normalized'].isin(self.classic_bt_devices)]
        ble_df = non_infrastructure_df[~non_infrastructure_df['address_normalized'].isin(self.classic_bt_devices)]
        
        # Show Classic BT analysis
        print(f"\n3. CLASSIC BLUETOOTH ANALYSIS:")
        if len(classic_bt_df) > 0:
            classic_dwell = classic_bt_df.groupby('address_normalized')['dwellTime'].sum()
            print(f"  Total Classic BT devices: {len(self.classic_bt_devices)}")
            print(f"  Avg dwell time: {classic_dwell.mean():.1f} min")
            print(f"  Classic BT with 10+ min dwell: {len(classic_dwell[classic_dwell >= 10])}")
            
            # Classic BT address types (before reclassification)
            classic_types = classic_bt_df.groupby('address_type')['address_normalized'].nunique()
            print(f"  Address types (as originally classified):")
            for addr_type, count in classic_types.items():
                print(f"    {addr_type}: {count}")
        
        # Address type breakdown (BLE only)
        print(f"\n4. BLE DEVICE ADDRESS TYPE BREAKDOWN:")
        ble_type_counts = ble_df.groupby(['date', 'address_type'])['address_normalized'].nunique()
        for (date, addr_type), count in ble_type_counts.items():
            print(f"  {date} - {addr_type}: {count:,}")
        
        # Calculate total dwell time per device
        ble_device_dwell = ble_df.groupby(['date', 'address_normalized', 'address_type'])['dwellTime'].sum().reset_index()
        ble_device_dwell.columns = ['date', 'address', 'address_type', 'total_dwell_time']
        
        # Dwell time filtering (BLE only)
        print(f"\n5. BLE FILTERED ESTIMATES (by dwell time thresholds):")
        dwell_thresholds = [0, 5, 10, 15, 30, 60]
        
        for threshold in dwell_thresholds:
            print(f"\n  BLE devices with ≥{threshold} min total dwell time:")
            filtered = ble_device_dwell[ble_device_dwell['total_dwell_time'] >= threshold]
            
            # Overall count by date
            by_date = filtered.groupby('date')['address'].nunique()
            for date, count in by_date.items():
                print(f"    {date}: {count:,} total")
            
            # By address type for key thresholds
            if threshold in [10, 30]:
                print(f"    Breakdown by type:")
                by_type = filtered.groupby(['date', 'address_type'])['address'].nunique()
                for (date, addr_type), count in by_type.items():
                    print(f"      {addr_type}: {count:,}")
        
        # BLE visitor estimation
        print(f"\n6. BLE VISITOR ESTIMATION (10+ min dwell, excluding infrastructure & Classic BT):")
        
        # Apply rotation factors based on address type
        rotation_factors = {
            'Random Static': 1.0,
            'Public': 1.0,  # Modern phones rarely use public
            'Public (Unregistered OUI)': 1.0,
            'Random Private Resolvable': 4.0,  # iOS rotation
            'Random Private Non-Resolvable': 8.0,  # Heavy rotation
            'Ambiguous': 2.0,
            'Invalid': 1.0
        }
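        # Rough de-duplication: a device using an address type with factor F is
        # assumed to have produced ~F distinct addresses over the event, so each
        # type's device count is divided by its factor below.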
        
        ble_visitor_10min = ble_device_dwell[ble_device_dwell['total_dwell_time'] >= 10]
        
        adjusted_estimates = {}
        for date in ble_visitor_10min['date'].unique():
            date_data = ble_visitor_10min[ble_visitor_10min['date'] == date]
            
            # Raw count
            raw_count = len(date_data)
            
            # Type-adjusted count
            adjusted_count = 0
            for addr_type, factor in rotation_factors.items():
                type_count = len(date_data[date_data['address_type'] == addr_type])
                adjusted_count += type_count / factor
            
            adjusted_estimates[date] = {
                'raw': raw_count,
                'adjusted': int(adjusted_count)
            }
            
            print(f"\n  {date}:")
            print(f"    Raw BLE visitor count (10+ min): {raw_count:,}")
            print(f"    Rotation-adjusted BLE estimate: {int(adjusted_count):,}")
            
            # Show breakdown
            print(f"    Breakdown by type:")
            for addr_type in sorted(date_data['address_type'].unique()):
                type_count = len(date_data[date_data['address_type'] == addr_type])
                factor = rotation_factors.get(addr_type, 1.0)
                adjusted = type_count / factor
                print(f"      {addr_type}: {type_count} → {int(adjusted)} (factor: {factor}x)")
        
        # Final summary
        print(f"\n7. ATTENDANCE SUMMARY:")
        if EVENT_DATES and EVENT_DATES[0] in adjusted_estimates:
            ble_adjusted = adjusted_estimates[EVENT_DATES[0]]['adjusted']
            classic_10min = len(classic_dwell[classic_dwell >= 10]) if len(classic_bt_df) > 0 else 0
            
            print(f"  BLE visitors (adjusted): {ble_adjusted:,}")
            print(f"  Classic BT devices (10+ min): {classic_10min:,}")
            print(f"  Cluster-based estimate: {len(self.clusters.get(EVENT_DATES[0], {})):,}")
            print(f"  Infrastructure devices: {len(self.infrastructure_devices):,}")
            print(f"\n  BEST ESTIMATE (BLE adjusted + Classic BT): {ble_adjusted + classic_10min:,}")
        
        # 7. Behavioral patterns
        plt.figure(figsize=(15, 10))
        
        # Dwell time distribution by address type
        plt.subplot(2, 2, 1)
        for addr_type in non_infrastructure_df['address_type'].unique():
            type_data = non_infrastructure_df[non_infrastructure_df['address_type'] == addr_type]['dwellTime']
            if len(type_data) > 10:
                plt.hist(type_data, bins=30, alpha=0.5, label=addr_type)
        plt.xlabel('Dwell Time (minutes)')
        plt.ylabel('Count')
        plt.title('Dwell Time Distribution by Address Type (Visitors Only)')
        plt.legend()
        plt.xlim(0, 300)
        
        # Location patterns
        plt.subplot(2, 2, 2)
        location_counts = Counter()
        for locs in non_infrastructure_df['locationsVisited']:
            if isinstance(locs, list):
                location_counts.update(locs)
        
        if location_counts:
            locs, counts = zip(*location_counts.most_common(10))
            plt.bar(locs, counts)
            plt.xlabel('Location')
            plt.ylabel('Visit Count')
            plt.title('Top 10 Visited Locations (Visitors Only)')
            plt.xticks(rotation=45)
        
        # RSSI patterns over time
        plt.subplot(2, 2, 3)
        hourly_rssi = non_infrastructure_df.groupby(non_infrastructure_df['firstSeenTime'].dt.hour)['avgRssi'].mean()
        hourly_rssi.plot(marker='o')
        plt.xlabel('Hour of Day')
        plt.ylabel('Average RSSI (dBm)')
        plt.title('Average RSSI by Hour (Visitors Only)')
        plt.grid(True)
        
        # Device count by hour
        plt.subplot(2, 2, 4)
        hourly_counts = non_infrastructure_df.groupby(non_infrastructure_df['firstSeenTime'].dt.hour)['address_normalized'].nunique()
        hourly_counts.plot(kind='bar')
        plt.xlabel('Hour of Day')
        plt.ylabel('Unique Devices')
        plt.title('Unique Devices by Hour (Visitors Only)')
        
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'behavioral_analysis.png'))
        plt.close()
        
        # 8. Data coverage heatmap
        self.create_coverage_heatmap()
        
        # Save detailed results
        self.save_results()
    
    def create_coverage_heatmap(self):
        """Create a heatmap showing data coverage by date and hour"""
        # Create a matrix of dates x hours
        hours_in_range = END_HOUR - START_HOUR + 1
        coverage_matrix = np.zeros((len(EVENT_DATES), hours_in_range))
        
        for i, date in enumerate(EVENT_DATES):
            if date in self.devices:
                # Count records per hour for this date
                date_df = self.df[self.df['date'] == date]
                for j, hour in enumerate(range(START_HOUR, END_HOUR + 1)):
                    hour_count = len(date_df[date_df['firstSeenTime'].dt.hour == hour])
                    coverage_matrix[i, j] = hour_count
        
        # Create heatmap
        plt.figure(figsize=(12, 6))
        sns.heatmap(coverage_matrix, 
                    xticklabels=range(START_HOUR, END_HOUR + 1),
                    yticklabels=EVENT_DATES,
                    cmap='YlOrRd',
                    annot=True,
                    fmt='.0f',
                    cbar_kws={'label': 'Number of Sessions'})
        plt.xlabel('Hour of Day')
        plt.ylabel('Date')
        plt.title('Data Coverage Heatmap - Sessions by Date and Hour')
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'data_coverage_heatmap.png'))
        plt.close()
    
    def save_results(self):
        """Save analysis results to files"""
        # Save fingerprints
        fingerprint_data = []
        for addr, fp in self.fingerprints.items():
            fp_record = {
                'address': addr, 
                'is_infrastructure': addr in self.infrastructure_devices,
                'is_classic_bt': addr in self.classic_bt_devices,
                **fp
            }
            fingerprint_data.append(fp_record)
        
        fp_df = pd.DataFrame(fingerprint_data)
        fp_df.to_csv(os.path.join(RESULTS_DIR, 'device_fingerprints.csv'), index=False)
        
        # Save infrastructure devices
        infra_df = pd.DataFrame({
            'address': list(self.infrastructure_devices),
            'type': 'infrastructure'
        })
        infra_df.to_csv(os.path.join(RESULTS_DIR, 'infrastructure_devices.csv'), index=False)
        
        # Save Classic Bluetooth devices
        classic_df = pd.DataFrame({
            'address': list(self.classic_bt_devices),
            'type': 'classic_bluetooth'
        })
        classic_df.to_csv(os.path.join(RESULTS_DIR, 'classic_bt_devices.csv'), index=False)
        
        # Save clusters
        cluster_data = []
        for date, clusters in self.clusters.items():
            for cluster_id, addresses in clusters.items():
                for addr in addresses:
                    cluster_data.append({
                        'date': date,
                        'cluster_id': cluster_id,
                        'address': addr
                    })
        
        if cluster_data:
            cluster_df = pd.DataFrame(cluster_data)
            cluster_df.to_csv(os.path.join(RESULTS_DIR, 'device_clusters.csv'), index=False)
        
        # Save session summary with corrected address types
        session_data = []
        for date, devices in self.devices.items():
            for addr, device in devices.items():
                for session in device.get('sessions', []):
                    session_data.append({
                        'date': date,
                        'address': addr,
                        'address_type': self.classify_address(addr),
                        'is_infrastructure': addr in self.infrastructure_devices,
                        'is_classic_bt': addr in self.classic_bt_devices,
                        'firstSeenTime': session['firstSeenTime'],
                        'lastSeenTime': session['lastSeenTime'],
                        'dwellTime': session['dwellTime'],
                        'recordCount': session['recordCount'],
                        'avgRssi': session['avgRssi'],
                        'locations': ','.join(session['locationsVisited'])
                    })
        
        if session_data:
            session_df = pd.DataFrame(session_data)
            session_df.to_csv(os.path.join(RESULTS_DIR, 'device_sessions_enhanced.csv'), index=False)
        
        # Save file loading summary
        self.save_loading_summary()
        
        print(f"\nResults saved to {RESULTS_DIR}/")
    
    def save_loading_summary(self):
        """Save a summary of which hourly files were loaded"""
        if self.file_loading_log:
            summary_df = pd.DataFrame(self.file_loading_log)
            summary_df.to_csv(os.path.join(RESULTS_DIR, 'file_loading_summary.csv'), index=False)
            
            # Print loading statistics
            total_files = len(summary_df)
            loaded_files = len(summary_df[summary_df['status'] == 'success'])
            missing_files = len(summary_df[summary_df['status'] == 'missing'])
            error_files = len(summary_df[summary_df['status'].str.startswith('error')])
            total_records = summary_df[summary_df['status'] == 'success']['records'].sum()
            
            print(f"\n=== FILE LOADING STATISTICS ===")
            print(f"Total files checked: {total_files}")
            print(f"Successfully loaded: {loaded_files}")
            print(f"Missing files: {missing_files}")
            print(f"Files with errors: {error_files}")
            print(f"Total records loaded: {total_records:,}")

def main():
    # Initialize analyzer
    analyzer = BLEFingerprinter()
    
    # Load and process data
    analyzer.load_and_process_data()
    
    # Perform analysis
    if not analyzer.df.empty:
        analyzer.analyze_and_visualize()
    else:
        print("No data loaded!")

if __name__ == "__main__":
    main()