U
    9hݗ                     @   s   d Z ddlZddlmZ ddlZddlmZmZ ddl	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZmZmZmZmZmZmZ d$ddZd%ddZd&ddZd'ddZdd Zdd Zdd Z d(ddZ!d)ddZ"dd Z#d d! Z$d"d# Z%dS )*a  flat.py

Provides alternative functions to hdbscan.HDBSCAN and others to
1. Allow prediction on a flat clustering by specifying 'n_clusters'.
    This is done by choosing the best cluster_selection_epsilon that produces
    the required number of clusters without adding unnecessary outliers.
2. Makes approximate_predict, membership_vector, and
    all_points_membership_vectors consistent with cluster_selection_epsilon

Provides the following functions:
==================================
HDBSCAN_flat: trained HDBSCAN instance with 'n_clusters' clusters
    The attributes (labels, probabilities, prediction_data) are tuned to
    produce 'n_clusters' clusters.

approximate_predict_flat: labels and probabilities for novel points
    Allows selecting n_clusters for novel points, or using the
    original clustering (potentially specified using cluster_selection_epsilon)

membership_vector_flat: Soft-clustering probabilities for novel points
    Similar to approximate_predict_flat, but for soft-clustering.
    **Use with caution**

all_points_membership_vectors_flat: Soft-clustering probabilities
    Similar to membership_vector_flat, but for points in training set
    **Use with caution**
    N)warn   )compute_stabilityget_cluster_tree_leaves)HDBSCAN_tree_to_labels)_bfs_from_cluster_tree)PredictionData_find_cluster_and_probability_find_neighbor_and_lambda)get_tree_row_with_childdist_membership_vectoroutlier_membership_vectorprob_in_some_cluster!all_points_dist_membership_vector$all_points_outlier_membership_vectorall_points_prob_in_some_cluster        Fc              	   K   s  |dkrJ|dkrJt |tr|s2d|d< tf |}n
|}d|_||  |S |dk	rn|dkrntd| d d}t |tsd|d< tf |}||  n*|r|}n
t|}d|_d|_||  |jdkrt|j	
 }|dk	r|jdkr||krtd| d	 d
|_t|j	|}n|}t||_|j}	|	 }	|j}
|j}|j}d}td|	|
||||d}|\|_|_|_|_|_t|j|j	|d |S )a  
    Train a HDBSCAN clusterer by specifying n_clusters.
    Or, modify a trained clusterer to return specific n_clusters.

    Parameters
    ----------
    X: array-like
        Data to be passed to HDBSCAN for training.

    n_clusters: int, default=None
        Number of clusters to produce.
        If None, revert to default HDBSCAN

    cluster_selection_epsilon: float, default=0.
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    clusterer: HDBSCAN, default=None
        If supplied, modify this clusterer to produce n_clusters clusters.

    inplace: bool, default=False
        If 'clusterer' parameter is supplied, and inplace is True,
            modify the previous clusterer inplace.
            If False, return a modified copy of the previous clusterer.

    **kwargs: keyword arguments
        All init arguments for HDBSCAN

    Returns
    -------
    new_clusterer: HDBSCAN
        New HDBSCAN instance; returned irrespective of inplace=True or False

    Usage
    -----
    # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters
    clusterer = HDBSCAN_flat(X_train, n_clusters=7,
                             min_cluster_size=12, min_samples=8)
    labels = clusterer.labels_
    proba = clusterer.probabilities_

    # Use a previously initialized/trained HDBSCAN
    old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8)
    clusterer = HDBSCAN_flat(X_train, n_clusters=7,
                             clusterer=old_clusterer, inplace=True)
    labels = clusterer.labels_
    proba = clusterer.probabilities_

    See Also
    ---------
    :py:func:`hdbscan.HDBSCAN`
    :py:func:`re_init`
    Nr   Tprediction_dataz'cluster_selection_epsilon' (=z+) is ignored when 'n_clusters' is supplied.eomCannot predict more than B with cluster selection method 'eom'. Changing to method 'leaf'...leafFcluster_selection_epsilon)
isinstancer   r   fitr   copydeepcopycluster_selection_methodlencondensed_tree__select_clustersselect_epsilonfloatr   Zsingle_linkage_tree_Zto_numpymin_cluster_sizeallow_single_clusterr   labels_Zprobabilities_Zcluster_persistence_Z_condensed_treeZ_single_linkage_treere_initprediction_data_)X
n_clustersr   	clustererZinplacekwargsZnew_clustererZmax_eom_clustersepsilonZsingle_linkage_treer%   r   r&   match_reference_implementationoutput r1   _/var/www/html/CrowdFlow/HYROX/ble_analysis_env_py38/lib/python3.8/site-packages/hdbscan/flat.pyHDBSCAN_flat0   sx    :







 r3   c              
   C   s  t t | jdk}|dk	r&t|}| j}|dkrR|dksD||krR|dkrR| j}t|ts| jdkrnt	d| j}t
|}|jdkr|dk	r||krtd| d d|_t||||d t |}|jd	 |jjd	 krt	d
|jjd dkrHtd dt j|jd t jd }t j|jd t jd}	|r@||	|fS ||	fS t j|jd t jd}t j|jd t jd}	| jp| j}
|jj|d|
 d\}}t|jd D ]B}t||j|| || |j|j |j!|
\}}}|||< ||	|< q|r||	|fS ||	fS dS )a
  
    Predict the cluster label of new points at a particular flat clustering,
        specified by n_clusters. This is a modified version of
        hdbscan.approximate_predict to allow selection of n_clusters.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    n_clusters: int, default=None
        The number of clusters to have in the flat clustering
            (over the training data, not points_to_predict)
        Ignored when prediction_data is supplied.

    cluster_selection_epsilon: float, default=None
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    prediction_data: PredictionData, default=None
        If supplied, use this to predict clusters for points_to_predict.
        This allows predicting on multiple datasets without corrupting
            prediction data associated with clusterer.

        If neither n_clusters, nor prediction_data are supplied,
            then the prediction_data associated with clusterer is used.

    return_prediction_data: bool, default=False
        If True, return prediction_data along with labels and proba.

    Returns
    -------
    labels : array (n_samples,)
        The predicted labels of the ``points_to_predict``

    probabilities : array (n_samples,)
        The soft cluster scores for each of the ``points_to_predict``

    prediction_data: PredictionData, optional
        prediction_data used to predict.
        Returned if return_prediciton_data is set to True.


    Usage
    -----
    # From a fitted HDBSCAN model, predict for n_clusters=5
    labels, proba = approximate_predict_flat(
                        clusterer, X_predict, n_clusters=5)

    # Store prediciton data for later use.
    labels, proba, pred_data = approximate_predict_flat(
                                    clusterer, X_predict, n_clusters=5,
                                    return_prediction_data=True)
    # and use this prediction data to predict on new points
    labels1, proba1 = approximate_predict_flat(
                                    clusterer, X_pred1,
                                    prediction_data=pred_data)

    See Also
    ---------
    :py:func:`hdbscan.prediction.approximate_predict`
    r   NzClusterer does not have prediction data! Try fitting with prediction_data=True set, or run generate_prediction_data on the clustererr   r   r   r   )r+   r   r   z-New points dimension does not match fit data!zfPrediction data does not have any defined clusters, new data will be automatically predicted as noise.Zdtype   k)"npsumuniquer'   intr!   r)   r   r	   
ValueErrorr   r   r   r   r(   Zasarrayshaperaw_datacluster_treeZonesZint32zerosZfloat32emptyfloat64min_samplesr%   treequeryranger
   core_distancescluster_mapmax_lambdas)r,   points_to_predictr+   r   r   Zreturn_prediction_dataZn_clusters_fitcondensed_treelabelsZprobabilitiesrD   neighbor_distancesneighbor_indicesilabelZprobZ	neighborsr1   r1   r2   approximate_predict_flat   s~    M









rR   c                 C   s  | tj}| j}|dkr8|dkr8|dkr8| j}| j}nt|dkr|dk	rzt||}W q tk
r   td| d d|_	t||}Y qX n|}t
| j}t|||d t|}tj|jd |jd ftjd}| jp| j}	|jj|d	|	 d
\}
}t|jd D ]}t|| |
| |j|	\}}t|j|}|d |krH|d }t|| |j|j}t||||j|j|j}|d |d  ||< ||  ||    < ||  t ||||j|j|j9  < q|}|S )a   
    (Adaptation of hdbscan's membership_vector for n_clusters, epsilon)
    Predict soft cluster membership probabilities;
        a vector for each point in ``points_to_predict`` that gives
        a probability that the given point is a member of a cluster
        for each of the selected clusters of the ``clusterer``.

    Parameters
    ----------
    clusterer: HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict: array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    prediction_data: PredictionData, default=None
        Prediction data associated with HDBSCAN for some flat clustering

    n_clusters: int, default=None
        Number of clusters over which to compute membership probabilities.
        These clusters are obtained as a flat clustering at some
            cluster_selection_epsilon.

    cluster_selection_epsilon: float, default=0.
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    Note: If neither n_clusters nor cluster_selection_epsilon are supplied,
        the clusterer's original clustering is used.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` is a member of cluster ``j`` is
        in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.predict.membership_vector`
    :py:func:`hdbscan.predict.all_points_membership_vectors`
    Nr   Failed to predict E clusters with cluster selection method 'eom'. Switching to 'leaf'...r   r   r   r5   r6   r7   
lambda_valg      ?g       @)!astyper9   rC   r!   r   r)   r#   AssertionErrorr   r   r   r   r(   clusters_from_prediction_datarB   r>   rD   r%   rE   rF   rG   r   rH   r   	_raw_treer   	exemplarsdist_metricr   leaf_max_lambdasr@   r:   r   )r,   rK   r   r+   r   rL   r.   clustersresultrD   rN   rO   rP   Znearest_neighborZlambda_Zneighbor_tree_rowZdistance_vecZoutlier_vecmembership_vectorsr1   r1   r2   membership_vector_flatj  s    2	 	
	r`   c                 C   s@  | j }|dkr$|dkr$| j}| j}nt|dkr|dk	rzzt||}W q~ tk
rv   td| d d|_t||}Y q~X n|}t| j}t	|||d t
|}|j}|jdkrt|jd S t||j|j}t||j|j|j}	t||j|j|j}
||	 }|jdd}||ddtjf  }||
ddtjf 9 }|}|S )	a  
    (Adaptation of hdbscan's all_points_membership_vector
        for n_clusters, epsilon)
    Predict soft cluster membership vectors for all points in the
    original dataset the clusterer was trained on. This function is more
    efficient by making use of the fact that all points are already in the
    condensed tree, and processing in bulk.

    Parameters
    ----------
    clusterer : HDBSCAN
         A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.
        This method does not work if the clusterer was trained
        with ``metric='precomputed'``.

    prediction_data: PredictionData, default=None
        Prediction data associated with HDBSCAN for some flat clustering

    n_clusters: int, optional, default=None
        Number of clusters over which to compute membership probabilities.
        These clusters are obtained as a flat clustering at some
            cluster_selection_epsilon.

    cluster_selection_epsilon: float, optional, default=None
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    Note: If neither n_clusters nor cluster_selection_epsilon are supplied,
        the clusterer's original clustering is used.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` of the original dataset is a member of
        cluster ``j`` is in ``membership_vectors[i, j]``.
    See Also
    --------
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
    :py:func:`hdbscan.prediction.membership_vector`
    NrS   rT   r   r   r   r   )Zaxis)r!   r   r)   r#   rW   r   r   r   r   r(   rX   r?   sizer9   rA   r>   r   rZ   r[   r   rY   r\   r@   r   r:   Znewaxis)r,   r   r+   r   rL   r.   r]   Z
all_pointsZdistance_vecsZoutlier_vecsZin_cluster_probsr^   Zrow_sumsr_   r1   r1   r2   "all_points_membership_vectors_flat  sX    /
rb   c                 C   s6   | j }|dkrt| |S |dkr*t| |S tddS )z
    Pick optimal epsilon from condensed tree based on n_clusters,
        calls functions specific to 'eom' or 'leaf' selection methods
    r   r   EInvalid Cluster Selection Method: %s
Should be one of: "eom", "leaf"
N)r   select_epsilon_eomselect_epsilon_leafr=   )rL   r+   r   r1   r1   r2   r#   z  s    

r#   c           	      C   s   |   }t|}||ks,td| dd | j}|d |d dk }dt| d }t|d	d	d
 }|D ]}t| |}t||krp qqptd|S )z
    Select epsilon so that persistence-based clustering,
        after truncating the tree at the above epsilon,
        has exactly 'n_clusters' clusters
    zCannot produce more than z with method 'eom'. z5Use method 'leaf' instead to extract flat clustering.rU   
child_sizer         ?-q=Nr4   zCould not find epsilon)	r"   r    rW   rY   r9   r;   sort_new_select_clustersRuntimeError)	rL   r+   Zeom_base_clustersZmax_clustersrE   Zcluster_lambdasZcandidate_epsilonsr.   Zsel_clustersr1   r1   r2   rd     s"    


rd   c                 C   s   | j d }| j d }|t}||dk }t|t}|t|d kr|tdt|d  dt|d  d t|d }d||d   }|d	 S )
z
    Select epsilon so that the leaves of condensed tree,
        after truncating at the above epsilon,
        has exactly 'n_clusters' clusters
    rU   rf   r   zHDBSCAN can only compute z! clusters. Setting n_clusters to z...rg   r6   rh   )rY   rV   r<   r9   r;   r$   r    r   )rL   r+   ZlambdasZchild_sizesr.   r1   r1   r2   re     s    	


&re   c                 C   s4  |dk	rt ||}t||}|j}dd ttt|D | _dd | j D | _i | _	g | _
|D ]}|d |d |k  | j	|< | |D ]$}| j| | j|< | j	| | j	|< qtjg tjd}| |D ]J}	|d |d |	k  }
|d |d |	k|d |
k@  }t||g}q| j
| j|  qddS )	ap  
    Modify PredictionData of HDBSCAN to account for epsilon.
    epsilon is the cluster_selection_epsilon that controls granularity
        of clusters; Large epsilon => More clusters

    Parameters
    ----------
    predData: PredictionData
        Contains data to use for predicting novel points.
        Defined in the HDBSCAN module

    condensed_tree: CondensedTree
        Tree structure that contains hierarchical clustering.
        Defined in the HDBSCAN module

    n_clusters: int, optional, default=None
        If specified, use this to obtain cluster_selection_epsilon
            from CondensedTree; Overrides cluster_selection_epsilon parameter

    cluster_selection_epsilon: float, default=0.
        In cluster tree, nodes are not split further beyond (>=) this value.
        epsilon is the inverse of core distance.

    Returns
    -------
    None
    Nc                 S   s   i | ]\}}t ||qS r1   r<   ).0ncr1   r1   r2   
<dictcomp>  s      zre_init.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r1   r1   )rm   ro   rn   r1   r1   r2   rp     s      rU   parentr5   child)r#   rj   rY   	enumeratesortedlistrI   itemsreverse_cluster_maprJ   rZ   maxZ_clusters_belowr9   arrayZint64Z_recurse_leaf_dfsZhstackappendr?   )ZpredDatarL   r+   r   selected_clustersZraw_condensed_treeclusterZsub_clusterZcluster_exemplarsr   Zleaf_max_lambdaZpointsr1   r1   r2   r(     sF    




r(   c                    s  | j }| j}t||r*t dd}nt dddd }||d dk }dd |D  |d	kr6|D ]n}|d
 |k}	tfdd|d |	 D }
|
| krd |< |
|< qnt||D ]}||krd |< qqn|dkrt fdd D }t	||||} D ]"}||kr(d |< nd |< qn|dkrtt
|}t|dkr~ D ]}d |< q^d |d
  < |dkrt	||||}n|} D ]"}||krd |< nd |< qntdt fdd D }|S )z
    Adaptation of get_clusters from hdbscan._hdbscan_tree.
    Avoids the label and proba computation at the end,
        and returns only the selected clusters instead.
    T)reverseNr4   rf   r   c                 S   s   i | ]
}|d qS )Tr1   )rm   r|   r1   r1   r2   rp   :  s      z(_new_select_clusters.<locals>.<dictcomp>r   rq   c                    s   g | ]} | qS r1   r1   )rm   rr   )	stabilityr1   r2   
<listcomp>?  s   z(_new_select_clusters.<locals>.<listcomp>rr   Fr   c                    s   g | ]} | r|qS r1   r1   rm   ro   
is_clusterr1   r2   r   K  s      r   r   rc   c                    s   g | ]} | rt |qS r1   rl   r   r   r1   r2   r   l  s      )rY   r   r   rt   keysr9   r:   r   setepsilon_searchr   r    minr=   )rL   r   r&   r/   rE   r   Z	node_listr@   nodeZchild_selectionZsubtree_stabilitysub_nodeZeom_clustersr{   ro   leavesr]   r1   )r   r~   r2   rj   &  sb    	









rj   c           
      C   s   g }g }| D ]}d|d |d |k d  }||k r||krt ||||}t|dr\|d }|| t||D ]}	|	|krp||	 qpq|| qt|S )Nr   rU   rr   r   __len__)traverse_upwardshasattrrz   r   r   )
r   r@   r   r&   r{   	processedr   epsZepsilon_childr   r1   r1   r2   r   p  s,      

r   c                 C   sj   | d   }| | d |k d }||kr4|r0|S |S d| | d |k d  }||krX|S t| |||S d S )Nrq   rr   r   rU   )r   r   )r@   r   r   r&   rootrq   Z
parent_epsr1   r1   r2   r     s     r   c                 C   s    t tt| j t jS )zA
    Extract selected clusters from PredictionData instance.
    )r9   ry   rt   ru   rw   valuesrV   Zintp)r   r1   r1   r2   rX     s
    rX   )Nr   NF)NNNF)NNr   )NNN)Nr   )FF)&__doc__r   warningsr   numpyr9   Z_hdbscan_treer   r   Zhdbscan_r   r   Zplotsr   Z
predictionr	   r
   r   Z_prediction_utilsr   r   r   r   r   r   r   r3   rR   r`   rb   r#   rd   re   r(   rj   r   r   rX   r1   r1   r1   r2   <module>   sR   $	     
     
 (    
     
w"#   
\  
J