U
    9hu>                     @   st   d dl Zd dlmZ d dlmZ ddlmZ ddlm	Z	 ddd	Z
d
d ZdddZdd ZdddZdddZdS )    N)pairwise_distances)cdist   )mst_linkage_core)isclose       @c                 C   sf   d| | dk  | | | dk< | j dd}|| jd d  }|  dkrVtt| }n|d| C }|S )a  
    Compute the all-points-core-distance for all the points of a cluster.

    Parameters
    ----------
    distance_matrix : array (cluster_size, cluster_size)
        The pairwise distance matrix between points in the cluster.

    d : integer
        The dimension of the data set, which is used in the computation
        of the all-point-core-distance as per the paper.

    Returns
    -------
    core_distances : array (cluster_size,)
        The all-points-core-distance of each point in the cluster

    References
    ----------
    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
    g      ?r   r   Zaxisg      )sumshapenpZzeroslen)distance_matrixdresult r   c/var/www/html/CrowdFlow/HYROX/ble_analysis_env_py38/lib/python3.8/site-packages/hdbscan/validity.pyall_points_core_distance   s    r   c                 C   sn   d}t | jd D ]V}t | jd D ]B}| | | d }| | | d }|dks$|| |kr^q$|| }q$q|S )Nr   r   )ranger
   )stacked_distances	max_ratioijdistZcoredistr   r   r   r   +   s    r   	euclideanFc                 K   s   |dkr>|dkrt d| ||kddf dd||kf }n2| ||kddf }	t|	fd|i|}| jd }|r||dfS t| |d}
t|
|
jd df}t|||jg}|rt	dt
t|  |jd	d
|
fS dS )aa  
    Compute pairwise distances for all the points of a cluster.

    If metric is 'precomputed' then assume X is a distance matrix for the full
    dataset. Note that in this case you must pass in 'd' the dimension of the
    dataset.

    Parameters
    ----------
    X : array (n_samples, n_features) or (n_samples, n_samples)
        The input data of the clustering. This can be the data, or, if
        metric is set to `precomputed` the pairwise distance matrix used
        for the clustering.

    labels : array (n_samples)
        The label array output by the clustering, providing an integral
        cluster label to each data point, with -1 for noise points.

    cluster_id : integer
        The cluster label for which to compute the distances

    metric : string
        The metric used to compute distances for the clustering (and
        to be re-used in computing distances for mr distance). If
        set to `precomputed` then X is assumed to be the precomputed
        distance matrix between samples.

    d : integer (or None)
        The number of features (dimension) of the dataset. This need only
        be set in the case of metric being set to `precomputed`, where
        the ambient dimension of the data is unknown to the function.

    **kwd_args :
        Extra arguments to pass to the distance computation for other
        metrics, such as minkowski, Mahanalobis etc.

    Returns
    -------

    distances : array (n_samples, n_samples)
        The distances between all points in `X` with `label` equal to `cluster_id`.

    core_distances : array (n_samples,)
        The all-points-core_distance of all points in `X` with `label` equal
        to `cluster_id`.

    References
    ----------
    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
    precomputedNz4If metric is precomputed a d value must be provided!metricr   )r   r   z(Max raw distance to coredistance ratio: r   )
ValueErrorr   r
   r   copyr   tiledstackTprintstrr   max)Xlabels
cluster_idr   r   no_coredistprint_max_raw_to_coredist_ratiokwd_argsr   Zsubset_Xcore_distancesZcore_dist_matrixr   r   r   r   distances_between_points8   s&    6&

r,   c           	      C   s,  t | }| }t|dd dD ]z\}}tt| t|d  |d d }t||d|ddf t}|||d k }t	|dkst
|d |d< q"t| jd t|jdd  tjdk }t	|sdg}tjt|jdd |ddt}t|r|| }n| }||fS )a6  
    Compute the 'internal' minimum spanning tree given a matrix of mutual
    reachability distances. Given a minimum spanning tree the 'internal'
    graph is the subgraph induced by vertices of degree greater than one.

    Parameters
    ----------
    mr_distances : array (cluster_size, cluster_size)
        The pairwise mutual reachability distances, inferred to be the edge
        weights of a complete graph. Since MSTs are computed per cluster
        this is the all-points-mutual-reacability for points within a single
        cluster.

    Returns
    -------
    internal_nodes : array
        An array listing the indices of the internal nodes of the MST

    internal_edges : array (?, 3)
        An array of internal edges in weighted edge list format; that is
        an edge is an array of length three listing the two vertices
        forming the edge and weight of the edge.

    References
    ----------
    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
    r   N   r   r   )r   r   	enumerater   wherer   intZintersect1dZastyper   AssertionErrorZaranger
   Zbincountr!   flattenZintpprodisinboolany)	Zmr_distancesZsingle_linkage_dataZmin_span_treeindexrow
candidatesZverticesZedge_selectionedgesr   r   r   internal_minimum_spanning_tree   s6    $" 
r;   c
                 K   s   |dkrJ| ||kddf dd||kf }||ddf dd|f }n0| ||k | }| ||k | }t |||f|
}|	r| S t|| |jd dfj}t|| |jd df}t|||gjdd}| S dS )a  
    Compute the density separation between two clusters. This is the minimum
    distance between pairs of points, one from internal nodes of MSTs of each cluster.

    Parameters
    ----------
    X : array (n_samples, n_features) or (n_samples, n_samples)
        The input data of the clustering. This can be the data, or, if
        metric is set to `precomputed` the pairwise distance matrix used
        for the clustering.

    labels : array (n_samples)
        The label array output by the clustering, providing an integral
        cluster label to each data point, with -1 for noise points.

    cluster_id1 : integer
        The first cluster label to compute separation between.

    cluster_id2 : integer
        The second cluster label to compute separation between.

    internal_nodes1 : array
        The vertices of the MST for `cluster_id1` that were internal vertices.

    internal_nodes2 : array
        The vertices of the MST for `cluster_id2` that were internal vertices.

    core_distances1 : array (size of cluster_id1,)
        The all-points-core_distances of all points in the cluster
        specified by cluster_id1.

    core_distances2 : array (size of cluster_id2,)
        The all-points-core_distances of all points in the cluster
        specified by cluster_id2.

    metric : string
        The metric used to compute distances for the clustering (and
        to be re-used in computing distances for mr distance). If
        set to `precomputed` then X is assumed to be the precomputed
        distance matrix between samples.

    **kwd_args :
        Extra arguments to pass to the distance computation for other
        metrics, such as minkowski, Mahanalobis etc.

    Returns
    -------
    The 'density separation' between the clusters specified by
    `cluster_id1` and `cluster_id2`.

    References
    ----------
    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
    r   Nr   r   r   r   )r   minr   r   r
   r!   r    r$   )r%   r&   Zcluster_id1Zcluster_id2Zinternal_nodes1Zinternal_nodes2Zcore_distances1Zcore_distances2r   r(   r*   Z
sub_selectr   Zcluster1Zcluster2Zcore_dist_matrix1Zcore_dist_matrix2Zmr_dist_matrixr   r   r   density_separation   s*    ;$

r=   c                 K   s"  i }i }	i }
i }|  d }tjtj||ftjd }tj|tjd}t|D ]h}t||kdkrfqNt| ||||f||d|\}||< t	|\|
|< ||< || j
d   |	|< qNt|D ]}t||kdkrq|
| }t|d |D ]j}t||kdkrq|
| }t| ||||||| || f||d||||f< |||f |||f< qqt| jd }d}t|D ]}t||kdkrqv||  }||	|  t ||	|  ||< |rtdt|  tdt|	|   t||k}||| ||  7 }qv|r||fS |S d	S )
a^
  
    Compute the density based cluster validity index for the
    clustering specified by `labels` and for each cluster in `labels`.

    Parameters
    ----------
    X : array (n_samples, n_features) or (n_samples, n_samples)
        The input data of the clustering. This can be the data, or, if
        metric is set to `precomputed` the pairwise distance matrix used
        for the clustering.

    labels : array (n_samples)
        The label array output by the clustering, providing an integral
        cluster label to each data point, with -1 for noise points.

    metric : optional, string (default 'euclidean')
        The metric used to compute distances for the clustering (and
        to be re-used in computing distances for mr distance). If
        set to `precomputed` then X is assumed to be the precomputed
        distance matrix between samples.

    d : optional, integer (or None) (default None)
        The number of features (dimension) of the dataset. This need only
        be set in the case of metric being set to `precomputed`, where
        the ambient dimension of the data is unknown to the function.

    per_cluster_scores : optional, boolean (default False)
        Whether to return the validity index for individual clusters.
        Defaults to False with the function returning a single float
        value for the whole clustering.

    mst_raw_dist : optional, boolean (default False)
        If True, the MST's are constructed solely via 'raw' distances (depending on the given metric, e.g. euclidean distances)
        instead of using mutual reachability distances. Thus setting this parameter to True avoids using 'all-points-core-distances' at all.
        This is advantageous specifically in the case of elongated clusters that lie in close proximity to each other <citation needed>.

    **kwd_args :
        Extra arguments to pass to the distance computation for other
        metrics, such as minkowski, Mahanalobis etc.

    Returns
    -------
    validity_index : float
        The density based cluster validity index for the clustering. This
        is a numeric value between -1 and 1, with higher values indicating
        a 'better' clustering.

    per_cluster_validity_index : array (n_clusters,)
        The cluster validity index of each individual cluster as an array.
        The overall validity index is the weighted average of these values.
        Only returned if per_cluster_scores is set to True.

    References
    ----------
    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
    r   )Zdtyper   )r(   r)   r-   )r   r(   zMinimum density separation: zDensity sparseness: N)r$   r   infZonesZfloat64emptyr   r	   r,   r;   r!   r=   floatr
   r<   r"   r#   )r%   r&   r   r   Zper_cluster_scoresZmst_raw_distverboser*   r+   Zdensity_sparsenessZ	mst_nodesZ	mst_edgesZmax_cluster_idZdensity_sepZcluster_validity_indicesr'   Zdistances_for_mstr   Zinternal_nodes_ir   Zinternal_nodes_jZ	n_samplesr   Zmin_density_sepZcluster_sizer   r   r   validity_index  s    ;      
rB   )r   )r   NFF)r   F)r   NFFF)numpyr   Zsklearn.metricsr   Zscipy.spatial.distancer   Z_hdbscan_linkager   Zhdbscan_r   r   r   r,   r;   r=   rB   r   r   r   r   <module>   s*   
$      
PF   
S        