U
    9hid                     @   s   d dl Zd dlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZmZ d dlmZ G dd	 d	eZd
d Zdd Zdd ZdddZdd Zdd Zdd ZdS )    N)KDTreeBallTree   )DistanceMetric)recurse_leaf_dfs)get_tree_row_with_childdist_membership_vectoroutlier_membership_vectorprob_in_some_cluster!all_points_dist_membership_vector$all_points_outlier_membership_vectorall_points_prob_in_some_cluster)warnc                   @   s4   e Zd ZdZeedZdd Zdd Zdd	d
Z	dS )PredictionDataa  
    Extra data that allows for faster prediction if cached.

    Parameters
    ----------

    data : array (n_samples, n_features)
        The original data set that was clustered

    condensed_tree : CondensedTree
        The condensed tree object created by a clustering

    min_samples : int
        The min_samples value used in clustering

    tree_type : string, optional
        Which type of space tree to use for core distance computation.
        One of:
            * ``kdtree``
            * ``balltree``

    metric : string, optional
        The metric used to determine distance for the clustering.
        This is the metric that will be used for the space tree to determine
        core distances etc.

    **kwargs :
        Any further arguments to the metric.

    Attributes
    ----------

    raw_data : array (n_samples, n_features)
        The original data set that was clustered

    tree : KDTree or BallTree
        A space partitioning tree that can be queried for nearest neighbors.

    core_distances : array (n_samples,)
        The core distances for every point in the original data set.

    cluster_map : dict
        A dictionary mapping cluster numbers in the condensed tree to labels
        in the final selected clustering.

    cluster_tree : structured array
        A version of the condensed tree that only contains clusters, not
        individual points.

    max_lambdas : dict
        A dictionary mapping cluster numbers in the condensed tree to the
        maximum lambda value seen in that cluster.
    )kdtreeZballtreec                 C   sB   g }|g}|r>| | | jd t| jd | }| }q
|S )Nchildparent)extendcluster_treenpisintolist)selfclusterresultZ
to_process r   e/var/www/html/CrowdFlow/HYROX/ble_analysis_env_py38/lib/python3.8/site-packages/hdbscan/prediction.py_clusters_belowM   s    

zPredictionData._clusters_belowc                    sF    j  j d |k d }t|dkr*|gS t fdd|D g S d S )Nr   r   r   c                    s   g | ]}t  j|qS r   )r   r   ).0r   r   r   r   
<listcomp>`   s     z4PredictionData._recurse_leaf_dfs.<locals>.<listcomp>)r   lensum)r   Zcurrent_nodechildrenr   r   r   _recurse_leaf_dfsZ   s     z PredictionData._recurse_leaf_dfsr   	euclideanc                 K   s  | tj| _| j| | jfd|i|| _| jj||dd d d df | _tj	|f|| _
t| }|j}dd ttt|D | _dd | j D | _||d d	k | _i | _i | _g | _tt| jd
 | jd g}	|	D ]"}
|d |d
 |
k  | j|
< q|D ]}
|d |d
 |
k  | j|
< | |
D ]&}| j|
 | j|< | j|
 | j|< q<tjg tjd}| |
D ]L}|d |d
 |k  }|d |d
 |k|d |k@  }t||g}q~| j| j|  qd S )Nmetrickr   c                 S   s   i | ]\}}||qS r   r   )r   ncr   r   r   
<dictcomp>n   s      z+PredictionData.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r   r   )r   r+   r*   r   r   r   r,   o   s      Z
child_sizer   r   r   
lambda_valZdtype)astyper   float64raw_data_tree_type_maptreequerycore_distancesr   Z
get_metricdist_metricsorted_select_clusters	_raw_tree	enumeratelistcluster_mapitemsZreverse_cluster_mapr   max_lambdasleaf_max_lambdas	exemplarssetZhstackmaxr   arrayZint64r$   append)r   dataZcondensed_treemin_samplesZ	tree_typer&   kwargsZselected_clustersZraw_condensed_treeZall_clustersr   Zsub_clusterZcluster_exemplarsleafZleaf_max_lambdaZpointsr   r   r   __init__b   sR    "



zPredictionData.__init__N)r   r%   )
__name__
__module____qualname____doc__r   r   r2   r   r$   rI   r   r   r   r   r      s   5
	   r   c           
      C   sx   ||  }|| t | jd  }t |||fjdd}| }| | }|| dkrbd||  }	nt t jj}	||	fS )ax  
    Find the nearest mutual reachability neighbor of a point, and  compute
    the associated lambda value for the point, given the mutual reachability
    distance to a nearest neighbor.

    Parameters
    ----------
    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    neighbor : int
        The index into the full raw data set of the nearest mutual reachability
        distance neighbor of the point.

    lambda_ : float
        The lambda value at which this point joins/merges with `neighbor`.
    r   Zaxis              ?)r   onesshapeZvstackrB   ZargminZfinfodouble)
neighbor_indicesneighbor_distancesr5   rF   Zneighbor_core_distancesZpoint_core_distancesZmr_distancesZnn_indexnearest_neighborlambda_r   r   r   _find_neighbor_and_lambda   s"    
rX   c                 C   s   | d   }t||||\}}t| |}|d }	|d |krN|	dd|d f}
nF|	|kr| | d |	k d |kr| d | d |	k d }	qN|	dd|f}
t| |
S )a  
    Create a new condensed tree with an additional point added, allowing for
    computations as if this point had been part of the original tree. Note
    that this makes as little change to the tree as possible, with no
    re-optimizing/re-condensing so that the selected clusters remain
    effectively unchanged.

    Parameters
    ----------
    tree : structured array
        The raw format condensed tree to update.

    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    new_tree : structured array
        The original tree with an extra row providing the parent cluster
        and lambda information for a new point given index -1.
    r   r-   r)   r   r   r   )minrX   r   r   rD   )r3   rT   rU   r5   rF   	tree_rootrV   rW   neighbor_tree_rowpotential_clusterZnew_tree_rowr   r   r   _extend_condensed_tree   s0     
r]   c                 C   s   | j }|d  }	t||||\}
}t||
}|d }|d |kr|||	kr||d |d |k |kr||d |d |k d }qB||kr|| }nd}|dkr|| }|dkrt||}|| }qd}nd}|||
fS )aY  
    Return the cluster label (of the original clustering) and membership
    probability of a new data point.

    Parameters
    ----------
    tree : CondensedTree
        The condensed tree associated with the clustering.

    cluster_tree : structured_array
        The raw form of the condensed tree with only cluster information (no
        data on individual points). This is significantly more compact.

    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points

    cluster_map : dict
        A dictionary mapping cluster numbers in the condensed tree to labels
        in the final selected clustering.

    max_lambdas : dict
        A dictionary mapping cluster numbers in the condensed tree to the
        maximum lambda value seen in that cluster.

    min_samples : int
        The min_samples value used to generate core distances.
    r   r-   r   r   r)   rO   rP   )r9   rY   rX   r   )r3   r   rT   rU   r5   r<   r>   rF   Zraw_treerZ   rV   rW   r[   r\   Zcluster_label
max_lambdaprobr   r   r   _find_cluster_and_probability   sB    %



r`   Fc              
   C   s  | j dkrtdt|}|jd | j jjd kr<td| j jjd dkrtd dtj|jd tj	d }tj
|jd tjd}|rdtj|jd tj	d }|||fS ||fS tj|jd tj	d}tj|jd tjd}|rtj|jd tj	d}| jp| j}| j jj|d	| d
\}}t|jd D ]Z}	t| j| j j||	 ||	 | j j| j j| j j|\}
}}|
||	< |||	< |r2|||	< q2|r|||fS ||fS )a  Predict the cluster label of new points. The returned labels
    will be those of the original clustering found by ``clusterer``,
    and therefore are not (necessarily) the cluster labels that would
    be found by clustering the original data combined with
    ``points_to_predict``, hence the 'approximate' label.

    If you simply wish to assign new points to an existing clustering
    in the 'best' way possible, this is the function to use. If you
    want to predict how ``points_to_predict`` would cluster with
    the original data under HDBSCAN the most efficient existing approach
    is to simply recluster with the new point(s) added to the original dataset.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.
    
    return_connecting_points : bool, optional
        Whether to return the index of the nearest neighbor in the original
        dataset for each of the ``points_to_predict``. Default is False

    Returns
    -------
    labels : array (n_samples,)
        The predicted labels of the ``points_to_predict``

    probabilities : array (n_samples,)
        The soft cluster scores for each of the ``points_to_predict``

    neighbors : array (n_samples,)
        The index of the nearest neighbor in the original dataset for each
        of the ``points_to_predict``. Only returned if 
        ``return_connecting_points=True``.

    See Also
    --------
    :py:func:`hdbscan.predict.membership_vector`
    :py:func:`hdbscan.predict.all_points_membership_vectors`

    NClusterer does not have prediction data! Try fitting with prediction_data=True set, or run generate_prediction_data on the clustererr   -New points dimension does not match fit data!r   z`Clusterer does not have any defined clusters, new data will be automatically predicted as noise.r)   r.      r'   )prediction_data_
ValueErrorr   asarrayrR   r1   r   r   rQ   int32zerosZfloat32emptyr0   rF   min_cluster_sizer3   r4   ranger`   condensed_tree_r5   r<   r>   )	clustererpoints_to_predictZreturn_connecting_pointslabelsZprobabilitiesZ	neighborsrF   rU   rT   ilabelr_   Zneighborr   r   r   approximate_predictK  sT    0






rr   c                 C   s  z
| j  W n tk
r&   tdY nX t|}|jd | j jjd krRtd| j jjd dkrtd tj	|jd tj
d}|S tj|jd tjd}| jp| j}| j jj|d| d\}}| jj}|d	 }| }i }	t|d	 D ] }
||d	 |
k d
  |	|
< qt|D ]H}|d | }||k r8 qb|| }
|	| |	|
 kr|	| |	|
< qt|jd D ]|}t|| || | j j|\}}t||}|d	 }||  dkr|d
 }|	| }|dkr|| | ||< nd||< qp|S )a  Predict the outlier score of new points. The returned scores
    will be based on the original clustering found by ``clusterer``,
    and therefore are not (necessarily) the outlier scores that would
    be found by clustering the original data combined with
    ``points_to_predict``, hence the 'approximate' label.

    If you simply wish to calculate the outlier scores for new points
    in the 'best' way possible, this is the function to use. If you
    want to predict the outlier score of ``points_to_predict`` with
    the original data under HDBSCAN the most efficient existing approach
    is to simply recluster with the new point(s) added to the original dataset.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    scores : array (n_samples,)
        The predicted scores of the ``points_to_predict``

    See Also
    --------
    :py:func:`hdbscan.predict.membership_vector`
    :py:func:`hdbscan.predict.all_points_membership_vectors`

    ra   r   rb   r   zcClusterer does not have any defined clusters, new data will be automatically predicted as outliers.r.   rc   r'   r   r-   r   rO   )rd   AttributeErrorre   r   rf   rR   r1   r   r   rQ   rg   ri   r0   rF   rj   r3   r4   rl   r9   rY   uniquerB   Zargsortrk   rX   r5   r   )rm   rn   ZscoresrF   rU   rT   r3   Zparent_arrayrZ   r>   r   r*   r   rp   ZneighrW   r[   r\   r^   r   r   r   approximate_predict_scores  s`    $





ru   c                 C   sV  | tj}ttt| j  tj}tj	|j
d |j
d ftjd}| jpV| j}| jjj|d| d\}}t|j
d D ]}t|| || | jj|\}}	t| jj|}
|
d |	kr|
d }	t|| | jj| jj}t||	|| jj| jj| jj}|d |d  ||< ||  ||    < ||  t||	|| jj| jj| jj9  < q|S )a  Predict soft cluster membership. The result produces a vector
    for each point in ``points_to_predict`` that gives a probability that
    the given point is a member of a cluster for each of the selected clusters
    of the ``clusterer``.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` is a member of cluster ``j`` is
        in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.predict.predict`
    :py:func:`hdbscan.predict.all_points_membership_vectors`
r   r.   rc   r'   r-   g      ?g       @)r/   r   r0   rC   r7   r;   rl   r8   intpri   rR   rF   rj   rd   r3   r4   rk   rX   r5   r   r9   r   r@   r6   r	   r?   r   r"   r
   )rm   rn   clustersr   rF   rU   rT   rp   rV   rW   r[   Zdistance_vecZoutlier_vecr   r   r   membership_vector  sh    
 

rx   c                 C   s   t tt| j t j}| jj	}|j
dkrBt |jd S t|| jj| jj}t|| jj| jj| jj}t|| jj| jj| jj}|| }|jdd}||ddt jf  }||ddt jf 9 }|S )a  Predict soft cluster membership vectors for all points in the
    original dataset the clusterer was trained on. This function is more
    efficient by making use of the fact that all points are already in the
    condensed tree, and processing in bulk.

    Parameters
    ----------
    clusterer : HDBSCAN
         A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.
        This method does not work if the clusterer was trained
        with ``metric='precomputed'``.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` of the original dataset is a member of
        cluster ``j`` is in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.predict.predict`
    :py:func:`hdbscan.predict.all_points_membership_vectors`
    r   r   rN   N)r   rC   r7   r;   rl   r8   r/   rv   rd   r1   sizerh   rR   r   r@   r6   r   r9   r?   r   r   r"   Znewaxis)rm   rw   Z
all_pointsZdistance_vecsZoutlier_vecsZin_cluster_probsr   Zrow_sumsr   r   r   all_points_membership_vectorsm  s4     
rz   )F)numpyr   Zsklearn.neighborsr   r   Zdist_metricsr   Z_hdbscan_treer   Z_prediction_utilsr   r   r	   r
   r   r   r   warningsr   objectr   rX   r]   r`   rr   ru   rx   rz   r   r   r   r   <module>   s   $}2;L
diU