#!/usr/bin/env python3
"""
Advanced BLE Analysis with Fingerprinting and Clustering
Modified for raw BLE data format
"""

import json
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# Configuration
DATA_DIR = "/var/www/html/CrowdFlow/CityMarket"  # Update this path
FILE_PREFIX = "cm"  # Update this to match your directory name prefix
EVENT_DATES = ["20250816"]  # Update these dates
RESULTS_DIR = "./analysis_results"

# Optional: specify hour range if event doesn't run 24 hours
START_HOUR = 8   # First hour to check (0-23)
END_HOUR = 16    # Last hour to check (0-23)
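
# Expected input: each hourly JSON file under DATA_DIR is assumed to hold a flat
# list of raw advertisement records shaped roughly like
#   {"timestamp": "...", "address": "...", "rssi": -72, "Location": "GateA", "state": "..."}
# (field names inferred from how records are read below; adjust if your export differs).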

# Analysis parameters based on research
RSSI_BIN_WIDTH = 5  # dBm bins for RSSI histograms
MIN_RSSI_SAMPLES = 10  # Minimum RSSI readings for reliable fingerprint
IOS_ROTATION_WINDOW = 15  # minutes - iOS MAC rotation cycle
DBSCAN_EPS = 0.3  # DBSCAN epsilon for clustering
MIN_CLUSTER_SIZE = 2  # Minimum devices to form a cluster
CONFIDENCE_THRESHOLD = 0.75  # 75% confidence for cross-day linking
DWELL_TIME_THRESHOLD = 15  # minutes - gap before considering new visit

class BLEFingerprinter:
    def __init__(self):
        self.devices = {}
        self.fingerprints = {}
        self.clusters = {}
        self.cross_day_links = defaultdict(list)
        self.file_loading_log = []  # Track which files were loaded
        
        # Create results directory
        os.makedirs(RESULTS_DIR, exist_ok=True)
    
    def normalize_address(self, address):
        """Normalize a MAC address to 12 uppercase hex characters with no separators"""
        if isinstance(address, (int, float)):
            address = str(int(address))
        else:
            address = str(address)
        
        address = address.replace(':', '').upper()
        if not address:
            return ''  # Missing address; callers skip empty results
        if len(address) < 12:
            address = address.zfill(12)
        return address[:12]
    
    def classify_address(self, address):
        """Classify BLE address type (heuristic).
        
        The public/random distinction is really signalled by the TxAdd flag in the
        advertising PDU, which is not preserved here, so this falls back to the two
        most significant bits of the address as defined for random addresses in the
        Bluetooth Core spec: 0b11 = static random, 0b01 = resolvable private,
        0b00 = non-resolvable private (indistinguishable from a public address on
        the address bits alone, so the IEEE locally administered bit is used as a
        tie-breaker).
        """
        if len(address) != 12:
            return "Invalid"
        
        try:
            msb = int(address[:2], 16)
            
            # Two most significant bits select the random address sub-type
            type_bits = (msb >> 6) & 0x03
            # IEEE locally administered bit (tie-breaker for the 0b00 case)
            is_local = (msb & 0x02) != 0
            
            if type_bits == 0b11:
                return "Random Static"
            elif type_bits == 0b01:
                return "Random Private Resolvable"
            elif type_bits == 0b10:
                return "Reserved"
            elif is_local:
                return "Random Private Non-Resolvable"
            else:
                return "Public"
        except ValueError:
            return "Invalid"
    
    def aggregate_raw_records(self, raw_records):
        """Aggregate raw records into device sessions"""
        # Sort by timestamp
        raw_records.sort(key=lambda x: x['timestamp'])
        
        sessions = []
        current_session = None
        
        for record in raw_records:
            if current_session is None:
                # Start new session
                current_session = {
                    'firstSeenTime': record['timestamp'],
                    'lastSeenTime': record['timestamp'],
                    'rssi_values': [record['rssi']],
                    'locations': [record['Location']],
                    'timestamps': [record['timestamp']],
                    'state_changes': [record.get('state', 'unknown')]
                }
            else:
                # Check if this is part of the same session
                last_time = pd.to_datetime(current_session['lastSeenTime'])
                current_time = pd.to_datetime(record['timestamp'])
                time_gap = (current_time - last_time).total_seconds() / 60  # minutes
                
                if time_gap <= DWELL_TIME_THRESHOLD:
                    # Same session
                    current_session['lastSeenTime'] = record['timestamp']
                    current_session['rssi_values'].append(record['rssi'])
                    current_session['locations'].append(record['Location'])
                    current_session['timestamps'].append(record['timestamp'])
                    current_session['state_changes'].append(record.get('state', 'unknown'))
                else:
                    # New session
                    sessions.append(current_session)
                    current_session = {
                        'firstSeenTime': record['timestamp'],
                        'lastSeenTime': record['timestamp'],
                        'rssi_values': [record['rssi']],
                        'locations': [record['Location']],
                        'timestamps': [record['timestamp']],
                        'state_changes': [record.get('state', 'unknown')]
                    }
        
        # Don't forget the last session
        if current_session:
            sessions.append(current_session)
        
        # Calculate derived metrics for each session
        for session in sessions:
            first_time = pd.to_datetime(session['firstSeenTime'])
            last_time = pd.to_datetime(session['lastSeenTime'])
            session['dwellTime'] = (last_time - first_time).total_seconds() / 60  # minutes
            session['recordCount'] = len(session['rssi_values'])
            session['avgRssi'] = np.mean(session['rssi_values'])
            session['locationsVisited'] = list(set(session['locations']))
        
        return sessions
    
    def extract_features(self, device_data):
        """Extract multi-modal features for fingerprinting"""
        features = {}
        
        # RSSI fingerprint - create histogram
        if 'rssi_values' in device_data and len(device_data['rssi_values']) >= MIN_RSSI_SAMPLES:
            rssi_hist, _ = np.histogram(device_data['rssi_values'], 
                                       bins=range(-100, -40, RSSI_BIN_WIDTH))
            features['rssi_histogram'] = rssi_hist / len(device_data['rssi_values'])
            features['rssi_mean'] = np.mean(device_data['rssi_values'])
            features['rssi_std'] = np.std(device_data['rssi_values'])
            features['rssi_skew'] = stats.skew(device_data['rssi_values'])
        
        # Timing patterns
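        # (timestamps reach this method as epoch seconds, so the intervals below are in seconds)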
        if 'timestamps' in device_data and len(device_data['timestamps']) > 1:
            times = sorted(device_data['timestamps'])
            intervals = np.diff(times)
            if len(intervals) > 0:
                features['avg_interval'] = np.mean(intervals)
                features['interval_std'] = np.std(intervals)
                features['burst_ratio'] = np.sum(intervals < 1) / len(intervals)
        
        # Location patterns
        if 'locations' in device_data:
            loc_counts = Counter(device_data['locations'])
            total_visits = sum(loc_counts.values())
            features['location_entropy'] = stats.entropy(list(loc_counts.values()))
            features['primary_location_ratio'] = max(loc_counts.values()) / total_visits
            features['location_diversity'] = len(set(device_data['locations']))
        
        # Behavioral features
        features['total_dwell_time'] = device_data.get('total_dwell_time', 0)
        features['session_count'] = device_data.get('session_count', 0)
        features['total_records'] = device_data.get('total_records', 0)
        features['first_seen_hour'] = device_data.get('first_seen_hour', 0)
        
        return features
    
    def compute_fingerprint_similarity(self, fp1, fp2):
        """Compute similarity between two fingerprints"""
        similarity_scores = []
        
        # RSSI histogram similarity (if both have it)
        if 'rssi_histogram' in fp1 and 'rssi_histogram' in fp2:
            # Cosine similarity
            hist_sim = np.dot(fp1['rssi_histogram'], fp2['rssi_histogram']) / (
                np.linalg.norm(fp1['rssi_histogram']) * np.linalg.norm(fp2['rssi_histogram'])
            )
            similarity_scores.append(hist_sim * 2)  # Weight RSSI heavily
        
        # Behavioral similarity
        behavioral_features = ['total_dwell_time', 'location_entropy', 'primary_location_ratio']
        for feat in behavioral_features:
            if feat in fp1 and feat in fp2:
                # Normalized difference
                max_val = max(abs(fp1[feat]), abs(fp2[feat]), 1)
                sim = 1 - abs(fp1[feat] - fp2[feat]) / max_val
                similarity_scores.append(sim)
        
        # Timing similarity
        if 'avg_interval' in fp1 and 'avg_interval' in fp2:
            interval_sim = 1 - abs(fp1['avg_interval'] - fp2['avg_interval']) / max(
                fp1['avg_interval'], fp2['avg_interval'], 1
            )
            similarity_scores.append(interval_sim)
        
        return np.mean(similarity_scores) if similarity_scores else 0
    
    def cluster_devices_dbscan(self, date_devices):
        """Use DBSCAN to cluster devices based on fingerprints"""
        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler
        
        # Extract feature matrix
        addresses = list(date_devices.keys())
        feature_matrix = []
        
        for addr in addresses:
            fp = self.fingerprints.get(addr, {})
            # Create feature vector
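            # Missing features fall back to rough neutral defaults (-80 dBm mean RSSI,
            # 60 s interval, noon first-seen) so sparse devices can still be placed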
            features = [
                fp.get('rssi_mean', -80),
                fp.get('rssi_std', 10),
                fp.get('total_dwell_time', 0),
                fp.get('location_entropy', 0),
                fp.get('primary_location_ratio', 1),
                fp.get('avg_interval', 60),
                fp.get('first_seen_hour', 12)
            ]
            feature_matrix.append(features)
        
        if len(feature_matrix) < MIN_CLUSTER_SIZE:
            return {}
        
        # Standardize features
        scaler = StandardScaler()
        X = scaler.fit_transform(feature_matrix)
        
        # Apply DBSCAN
        clustering = DBSCAN(eps=DBSCAN_EPS, min_samples=MIN_CLUSTER_SIZE)
        labels = clustering.fit_predict(X)
        
        # Group by cluster
        clusters = defaultdict(list)
        for i, label in enumerate(labels):
            if label != -1:  # Not noise
                clusters[label].append(addresses[i])
        
        return dict(clusters)
    
    def analyze_ios_rotation_windows(self, df):
        """Analyze 15-minute windows for iOS rotation patterns"""
        # Create 15-minute bins
        df['time_bin'] = pd.to_datetime(df['timestamp']).dt.floor(f'{IOS_ROTATION_WINDOW}min')
        
        rotation_analysis = []
        
        for time_bin, group in df.groupby('time_bin'):
            addresses = group['address'].unique()
            
            # Look for similar RSSI patterns in this window
            rssi_patterns = {}
            for _, row in group.iterrows():
                addr = row['address']
                if addr not in rssi_patterns:
                    rssi_patterns[addr] = []
                rssi_patterns[addr].append(row['rssi'])
            
            # Find potential rotations (similar RSSI in same window)
            potential_rotations = []
            addrs = list(rssi_patterns.keys())
            
            for i in range(len(addrs)):
                for j in range(i+1, len(addrs)):
                    if len(rssi_patterns[addrs[i]]) > 0 and len(rssi_patterns[addrs[j]]) > 0:
                        rssi_diff = abs(np.mean(rssi_patterns[addrs[i]]) - 
                                      np.mean(rssi_patterns[addrs[j]]))
                        if rssi_diff < 5:  # Within 5 dBm
                            potential_rotations.append((addrs[i], addrs[j], rssi_diff))
            
            rotation_analysis.append({
                'time_bin': time_bin,
                'unique_addresses': len(addresses),
                'potential_rotations': len(potential_rotations)
            })
        
        return pd.DataFrame(rotation_analysis)
    
    def probabilistic_cross_day_linking(self, day1_data, day2_data):
        """Link devices across days using probabilistic matching"""
        links = []
        
        for addr1, fp1 in day1_data.items():
            best_match = None
            best_score = 0
            
            for addr2, fp2 in day2_data.items():
                # Skip if same address (unlikely with randomization)
                if addr1 == addr2:
                    continue
                
                # Compute similarity
                similarity = self.compute_fingerprint_similarity(fp1, fp2)
                
                if similarity > best_score and similarity >= CONFIDENCE_THRESHOLD:
                    best_score = similarity
                    best_match = addr2
            
            if best_match:
                links.append({
                    'day1_address': addr1,
                    'day2_address': best_match,
                    'confidence': best_score,
                    'day1_type': self.classify_address(addr1),
                    'day2_type': self.classify_address(best_match)
                })
        
        return links
    
    def load_and_process_data(self):
        """Load data and create fingerprints"""
        all_data = []
        
        for date in EVENT_DATES:
            print(f"\nProcessing date: {date}")
            
            # Collect all data for this date across all hours
            date_raw_data = []
            hours_found = []
            
            # Try to load hourly files
            for hour in range(START_HOUR, END_HOUR + 1):
                hour_str = f"{hour:02d}"  # Format as 00, 01, ..., 23
                
                # Build filename for this hour
                # Using the configured prefix
                filename = os.path.join(DATA_DIR, f"{FILE_PREFIX}_combined_flat_{date}_{hour_str}.json")
                
                if os.path.exists(filename):
                    try:
                        with open(filename, 'r') as f:
                            hour_data = json.load(f)
                        date_raw_data.extend(hour_data)
                        hours_found.append(hour_str)  # Only count hours that actually loaded
                        print(f"  ✓ Loaded hour {hour_str}: {len(hour_data)} records")
                        
                        # Track loading info
                        self.file_loading_log.append({
                            'date': date,
                            'hour': hour_str,
                            'filename': filename,
                            'exists': True,
                            'records': len(hour_data),
                            'status': 'success'
                        })
                    except Exception as e:
                        print(f"  ✗ Error loading hour {hour_str}: {e}")
                        self.file_loading_log.append({
                            'date': date,
                            'hour': hour_str,
                            'filename': filename,
                            'exists': True,
                            'records': 0,
                            'status': f'error: {str(e)}'
                        })
                else:
                    self.file_loading_log.append({
                        'date': date,
                        'hour': hour_str,
                        'filename': filename,
                        'exists': False,
                        'records': 0,
                        'status': 'missing'
                    })
                
            if not date_raw_data:
                print(f"  ⚠️  No data found for {date}")
                continue
            
            print(f"  Total records for {date}: {len(date_raw_data)} from hours {', '.join(hours_found)}")
            
            # Show hourly distribution
            hour_distribution = defaultdict(int)
            for record in date_raw_data:
                try:
                    ts = pd.to_datetime(record.get('timestamp'))
                    hour_distribution[ts.hour] += 1
                except:
                    pass
            
            if hour_distribution:
                print(f"  Hourly distribution: {dict(sorted(hour_distribution.items()))}")
            
            # Group raw records by device address
            raw_by_device = defaultdict(list)
            
            for record in date_raw_data:
                address = self.normalize_address(record.get('address', ''))
                if not address:
                    continue
                
                # Parse the timestamp; skip records without a usable one
                if 'timestamp' not in record:
                    continue
                try:
                    record['timestamp'] = pd.to_datetime(record['timestamp'])
                except Exception:
                    continue
                
                raw_by_device[address].append(record)
            
            # Process each device's raw records
            date_devices = {}
            
            for address, raw_records in raw_by_device.items():
                # Aggregate raw records into sessions
                sessions = self.aggregate_raw_records(raw_records)
                
                if not sessions:
                    continue
                
                # Create device data structure
                date_devices[address] = {
                    'sessions': sessions,
                    'rssi_values': [],
                    'timestamps': [],
                    'locations': [],
                    'address': address,
                    'date': date,
                    'total_dwell_time': sum(s['dwellTime'] for s in sessions),
                    'session_count': len(sessions),
                    'total_records': sum(s['recordCount'] for s in sessions)
                }
                
                # Aggregate all values across sessions
                for session in sessions:
                    date_devices[address]['rssi_values'].extend(session['rssi_values'])
                    date_devices[address]['locations'].extend(session['locations'])
                    
                    # Convert timestamps to float for processing
                    for ts in session['timestamps']:
                        date_devices[address]['timestamps'].append(
                            pd.to_datetime(ts).timestamp()
                        )
                
                # Get first seen hour
                first_ts = pd.to_datetime(sessions[0]['firstSeenTime'])
                date_devices[address]['first_seen_hour'] = first_ts.hour
                
                # Add aggregated sessions to DataFrame format
                for session in sessions:
                    session_record = {
                        'date': date,
                        'address': address,
                        'address_normalized': address,
                        'address_type': self.classify_address(address),
                        'firstSeenTime': session['firstSeenTime'],
                        'lastSeenTime': session['lastSeenTime'],
                        'dwellTime': session['dwellTime'],
                        'recordCount': session['recordCount'],
                        'avgRssi': session['avgRssi'],
                        'locationsVisited': session['locationsVisited']
                    }
                    all_data.append(session_record)
            
            # Create fingerprints
            print(f"Creating fingerprints for {len(date_devices)} devices...")
            for addr, device_data in date_devices.items():
                self.fingerprints[addr] = self.extract_features(device_data)
            
            # Cluster devices
            print("Clustering devices...")
            clusters = self.cluster_devices_dbscan(date_devices)
            self.clusters[date] = clusters
            print(f"Found {len(clusters)} clusters")
            
            # Store for cross-day analysis
            self.devices[date] = date_devices
        
        # Create DataFrame
        self.df = pd.DataFrame(all_data)
        if not self.df.empty:
            self.df['firstSeenTime'] = pd.to_datetime(self.df['firstSeenTime'])
            self.df['lastSeenTime'] = pd.to_datetime(self.df['lastSeenTime'])
            
            # Print loading summary
            print("\n=== DATA LOADING SUMMARY ===")
            print(f"Total dates processed: {len(self.devices)}")
            print(f"Total unique devices: {len(self.fingerprints)}")
            print(f"Total sessions: {len(self.df)}")
            print(f"Date range: {self.df['date'].min()} to {self.df['date'].max()}")
            print(f"Hour range checked: {START_HOUR:02d}:00 to {END_HOUR:02d}:59")
    
    def analyze_and_visualize(self):
        """Perform analysis and create visualizations"""
        print("\n=== ADVANCED BLE ANALYSIS RESULTS ===")
        
        # 1. Address type distribution
        plt.figure(figsize=(12, 6))
        
        plt.subplot(1, 2, 1)
        type_counts = self.df['address_type'].value_counts()
        type_counts.plot(kind='bar')
        plt.title('Address Type Distribution')
        plt.xticks(rotation=45)
        
        # 2. RSSI fingerprint visualization
        plt.subplot(1, 2, 2)
        rssi_data = []
        for fp in self.fingerprints.values():
            if 'rssi_mean' in fp:
                rssi_data.append([fp['rssi_mean'], fp.get('rssi_std', 0)])
        
        if rssi_data:
            rssi_df = pd.DataFrame(rssi_data, columns=['RSSI Mean', 'RSSI Std'])
            plt.scatter(rssi_df['RSSI Mean'], rssi_df['RSSI Std'], alpha=0.5)
            plt.xlabel('RSSI Mean (dBm)')
            plt.ylabel('RSSI Std Dev')
            plt.title('RSSI Fingerprint Distribution')
        
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'address_analysis.png'))
        plt.close()
        
        # 3. iOS rotation window analysis
        print("\n--- iOS Rotation Window Analysis ---")
        for date in EVENT_DATES:
            if date in self.devices:
                date_df = self.df[self.df['date'] == date]
                if not date_df.empty:
                    # Need to create a raw format DataFrame for rotation analysis
                    raw_df_data = []
                    for addr, device in self.devices[date].items():
                        for session in device.get('sessions', []):
                            for i, ts in enumerate(session['timestamps']):
                                raw_df_data.append({
                                    'timestamp': ts,
                                    'address': addr,
                                    'rssi': session['rssi_values'][i]
                                })
                    
                    if raw_df_data:
                        raw_df = pd.DataFrame(raw_df_data)
                        rotation_df = self.analyze_ios_rotation_windows(raw_df)
                        
                        avg_addresses = rotation_df['unique_addresses'].mean()
                        avg_rotations = rotation_df['potential_rotations'].mean()
                        
                        print(f"{date}: Avg {avg_addresses:.1f} addresses per 15min, "
                              f"{avg_rotations:.1f} potential rotations")
        
        # 4. Cross-day linking
        print("\n--- Cross-Day Probabilistic Linking ---")
        dates = sorted(self.devices.keys())
        
        cross_day_summary = []
        for i in range(len(dates)-1):
            date1, date2 = dates[i], dates[i+1]
            
            # Get fingerprints for each day
            day1_fps = {addr: self.fingerprints[addr] 
                       for addr in self.devices[date1].keys() 
                       if addr in self.fingerprints}
            day2_fps = {addr: self.fingerprints[addr] 
                       for addr in self.devices[date2].keys() 
                       if addr in self.fingerprints}
            
            # Perform linking
            links = self.probabilistic_cross_day_linking(day1_fps, day2_fps)
            
            if links:
                link_df = pd.DataFrame(links)
                high_conf_links = link_df[link_df['confidence'] >= 0.8]
                
                print(f"{date1} → {date2}: {len(links)} total links, "
                      f"{len(high_conf_links)} high confidence (≥80%)")
                
                cross_day_summary.append({
                    'day_pair': f"{date1}-{date2}",
                    'total_links': len(links),
                    'high_conf_links': len(high_conf_links),
                    'avg_confidence': link_df['confidence'].mean()
                })
        
        # 5. Clustering effectiveness
        print("\n--- Clustering Analysis ---")
        cluster_summary = []
        
        for date, clusters in self.clusters.items():
            if clusters:
                cluster_sizes = [len(addrs) for addrs in clusters.values()]
                cluster_summary.append({
                    'date': date,
                    'num_clusters': len(clusters),
                    'avg_cluster_size': np.mean(cluster_sizes),
                    'max_cluster_size': max(cluster_sizes),
                    'clustered_devices': sum(cluster_sizes)
                })
        
        if cluster_summary:
            cluster_df = pd.DataFrame(cluster_summary)
            print(cluster_df)
        
        # 6. Attendance estimation
        print("\n--- Attendance Estimation ---")
        
        # Method 1: Raw unique count (overestimate)
        daily_unique = self.df.groupby('date')['address_normalized'].nunique()
        print(f"\nRaw unique devices per day:")
        for date, count in daily_unique.items():
            print(f"  {date}: {count:,}")
        
        # Method 2: Cluster-based estimate
        cluster_based_estimate = {}
        for date, clusters in self.clusters.items():
            # Count clusters + unclustered devices
            clustered = sum(len(addrs) for addrs in clusters.values())
            total = len(self.devices.get(date, {}))
            unclustered = total - clustered
            estimated = len(clusters) + unclustered
            cluster_based_estimate[date] = estimated
            print(f"\nCluster-based estimate for {date}: {estimated:,}")
        
        # Method 3: Cross-day adjusted
        if cross_day_summary:
            avg_retention = np.mean([s['high_conf_links'] for s in cross_day_summary])
            print(f"\nAverage high-confidence cross-day links per day pair: {avg_retention:.1f}")
            # Crude adjustment: assume roughly 30% of daily unique addresses are repeat visitors
            total_adjusted = int(sum(daily_unique.values()) * 0.7)
            print(f"Cross-day adjusted total estimate: {total_adjusted:,}")
        
        # 7. Behavioral patterns
        plt.figure(figsize=(15, 10))
        
        # Dwell time distribution by address type
        plt.subplot(2, 2, 1)
        for addr_type in self.df['address_type'].unique():
            type_data = self.df[self.df['address_type'] == addr_type]['dwellTime']
            if len(type_data) > 10:
                plt.hist(type_data, bins=30, alpha=0.5, label=addr_type)
        plt.xlabel('Dwell Time (minutes)')
        plt.ylabel('Count')
        plt.title('Dwell Time Distribution by Address Type')
        plt.legend()
        plt.xlim(0, 300)
        
        # Location patterns
        plt.subplot(2, 2, 2)
        location_counts = Counter()
        for locs in self.df['locationsVisited']:
            if isinstance(locs, list):
                location_counts.update(locs)
        
        if location_counts:
            locs, counts = zip(*location_counts.most_common(10))
            plt.bar(locs, counts)
            plt.xlabel('Location')
            plt.ylabel('Visit Count')
            plt.title('Top 10 Visited Locations')
            plt.xticks(rotation=45)
        
        # RSSI patterns over time
        plt.subplot(2, 2, 3)
        hourly_rssi = self.df.groupby(self.df['firstSeenTime'].dt.hour)['avgRssi'].mean()
        hourly_rssi.plot(marker='o')
        plt.xlabel('Hour of Day')
        plt.ylabel('Average RSSI (dBm)')
        plt.title('Average RSSI by Hour')
        plt.grid(True)
        
        # Device count by hour
        plt.subplot(2, 2, 4)
        hourly_counts = self.df.groupby(self.df['firstSeenTime'].dt.hour)['address_normalized'].nunique()
        hourly_counts.plot(kind='bar')
        plt.xlabel('Hour of Day')
        plt.ylabel('Unique Devices')
        plt.title('Unique Devices by Hour')
        
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'behavioral_analysis.png'))
        plt.close()
        
        # 8. Data coverage heatmap
        self.create_coverage_heatmap()
        
        # Save detailed results
        self.save_results()
    
    def create_coverage_heatmap(self):
        """Create a heatmap showing data coverage by date and hour"""
        # Create a matrix of dates x hours
        hours_in_range = END_HOUR - START_HOUR + 1
        coverage_matrix = np.zeros((len(EVENT_DATES), hours_in_range))
        
        for i, date in enumerate(EVENT_DATES):
            if date in self.devices:
                # Count records per hour for this date
                date_df = self.df[self.df['date'] == date]
                for j, hour in enumerate(range(START_HOUR, END_HOUR + 1)):
                    hour_count = len(date_df[date_df['firstSeenTime'].dt.hour == hour])
                    coverage_matrix[i, j] = hour_count
        
        # Create heatmap
        plt.figure(figsize=(12, 6))
        sns.heatmap(coverage_matrix, 
                    xticklabels=range(START_HOUR, END_HOUR + 1),
                    yticklabels=EVENT_DATES,
                    cmap='YlOrRd',
                    annot=True,
                    fmt='.0f',
                    cbar_kws={'label': 'Number of Sessions'})
        plt.xlabel('Hour of Day')
        plt.ylabel('Date')
        plt.title('Data Coverage Heatmap - Sessions by Date and Hour')
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'data_coverage_heatmap.png'))
        plt.close()
    
    def save_results(self):
        """Save analysis results to files"""
        # Save fingerprints
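        # Note: the rssi_histogram feature is a numpy array and will be written to
        # the CSV as its string repr; drop or expand it into columns if that matters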
        fingerprint_data = []
        for addr, fp in self.fingerprints.items():
            fp_record = {'address': addr, **fp}
            fingerprint_data.append(fp_record)
        
        fp_df = pd.DataFrame(fingerprint_data)
        fp_df.to_csv(os.path.join(RESULTS_DIR, 'device_fingerprints.csv'), index=False)
        
        # Save clusters
        cluster_data = []
        for date, clusters in self.clusters.items():
            for cluster_id, addresses in clusters.items():
                for addr in addresses:
                    cluster_data.append({
                        'date': date,
                        'cluster_id': cluster_id,
                        'address': addr
                    })
        
        if cluster_data:
            cluster_df = pd.DataFrame(cluster_data)
            cluster_df.to_csv(os.path.join(RESULTS_DIR, 'device_clusters.csv'), index=False)
        
        # Save session summary
        session_data = []
        for date, devices in self.devices.items():
            for addr, device in devices.items():
                for session in device.get('sessions', []):
                    session_data.append({
                        'date': date,
                        'address': addr,
                        'firstSeenTime': session['firstSeenTime'],
                        'lastSeenTime': session['lastSeenTime'],
                        'dwellTime': session['dwellTime'],
                        'recordCount': session['recordCount'],
                        'avgRssi': session['avgRssi'],
                        'locations': ','.join(str(loc) for loc in session['locationsVisited'])
                    })
        
        if session_data:
            session_df = pd.DataFrame(session_data)
            session_df.to_csv(os.path.join(RESULTS_DIR, 'device_sessions.csv'), index=False)
        
        # Save file loading summary
        self.save_loading_summary()
        
        print(f"\nResults saved to {RESULTS_DIR}/")
    
    def save_loading_summary(self):
        """Save a summary of which hourly files were loaded"""
        if self.file_loading_log:
            summary_df = pd.DataFrame(self.file_loading_log)
            summary_df.to_csv(os.path.join(RESULTS_DIR, 'file_loading_summary.csv'), index=False)
            
            # Print loading statistics
            total_files = len(summary_df)
            loaded_files = len(summary_df[summary_df['status'] == 'success'])
            missing_files = len(summary_df[summary_df['status'] == 'missing'])
            error_files = len(summary_df[summary_df['status'].str.startswith('error')])
            total_records = summary_df[summary_df['status'] == 'success']['records'].sum()
            
            print(f"\n=== FILE LOADING STATISTICS ===")
            print(f"Total files checked: {total_files}")
            print(f"Successfully loaded: {loaded_files}")
            print(f"Missing files: {missing_files}")
            print(f"Files with errors: {error_files}")
            print(f"Total records loaded: {total_records:,}")

def main():
    # Initialize analyzer
    analyzer = BLEFingerprinter()
    
    # Load and process data
    analyzer.load_and_process_data()
    
    # Perform analysis
    if not analyzer.df.empty:
        analyzer.analyze_and_visualize()
    else:
        print("No data loaded!")

if __name__ == "__main__":
    main()