Skip to content

Commit 356962e

Browse files
committed
Added smooth and non-smooth prediction functions with tests and comments
1 parent aaf469c commit 356962e

File tree

2 files changed

+293
-0
lines changed

2 files changed

+293
-0
lines changed

dadapy/clustering.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@
2626
import scipy as sp
2727

2828
from dadapy._cython import cython_clustering as cf
29+
from dadapy._cython import cython_density as cd
30+
from dadapy._utils.density_estimation import (
31+
return_not_normalised_density_kstarNN,
32+
return_not_normalised_density_PAk,
33+
)
34+
from dadapy._utils.utils import compute_cross_nn_distances
2935
from dadapy.density_estimation import DensityEstimation
3036

3137
cores = multiprocessing.cpu_count()
@@ -332,6 +338,116 @@ def compute_clustering_ADP_pure_python( # noqa: C901
332338

333339
return self.cluster_assignment
334340

341+
def predict_cluster_DP(self, X_new, Dthr=23.92812698, density_est="PAk"):
342+
"""Compute clustering for points outside the initialization set using Density Peaks.
343+
344+
Args:
345+
X_new (np.ndarray(float)): The points for which to predict cluster assignment
346+
Dthr (float): Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds
347+
to a p-value of 1e-6.
348+
349+
Returns:
350+
cluster_prediction (np.ndarray(int)): prediction of points to specific clusters
351+
cluster_probability (np.ndarray(float)): probability of points to belong to specific clusters
352+
"""
353+
cross_distances, cross_dist_indices = compute_cross_nn_distances(
354+
X_new, self.X, self.maxk, self.metric, self.period
355+
)
356+
357+
kstar = cd._compute_kstar_interp(
358+
self.intrinsic_dim,
359+
X_new.shape[0],
360+
self.maxk,
361+
Dthr,
362+
cross_dist_indices,
363+
cross_distances,
364+
self.distances,
365+
)
366+
if density_est == "PAk":
367+
log_den, log_den_err, dc = return_not_normalised_density_PAk(
368+
cross_distances,
369+
self.intrinsic_dim,
370+
kstar,
371+
self.maxk,
372+
interpolation=True,
373+
)
374+
elif density_est == "kstarNN":
375+
log_den, log_den_err, dc = return_not_normalised_density_kstarNN(
376+
cross_distances, self.intrinsic_dim, kstar, interpolation=True
377+
)
378+
379+
log_den -= np.log(self.N)
380+
381+
cluster_probability = np.zeros((len(X_new), self.N_clusters))
382+
for i in np.arange(len(X_new)):
383+
higher_density_neighbours = (
384+
self.log_den[cross_dist_indices][i]
385+
- self.log_den_err[cross_dist_indices][i]
386+
> log_den[i] - log_den_err[i]
387+
)
388+
try:
389+
index_nearest_neighbour_higher_density = cross_dist_indices[i][
390+
higher_density_neighbours
391+
][0]
392+
cluster_probability[
393+
i, self.cluster_assignment[index_nearest_neighbour_higher_density]
394+
] = 1
395+
# If no data with higher density is found in the neighbourhood,
396+
# predict the cluster of the closest data point
397+
except IndexError:
398+
cluster_probability[
399+
i, self.cluster_assignment[cross_dist_indices[0]]
400+
] = 1
401+
cluster_prediction = np.argmax(cluster_probability, axis=-1)
402+
return cluster_prediction, cluster_probability
403+
404+
def predict_cluster_inverse_distance_smooth(self, X_new, Dthr=23.92812698):
405+
"""Compute clustering for points outside the initialization set using a smooth estimator.
406+
407+
Args:
408+
X_new (np.ndarray(float)): The points for which to predict cluster assignment
409+
Dthr (float): Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds
410+
to a p-value of 1e-6.
411+
412+
Returns:
413+
cluster_prediction (np.ndarray(int)): prediction of points to specific clusters
414+
cluster_probability (np.ndarray(float)): probability of points to belong to specific clusters
415+
"""
416+
cross_distances, cross_dist_indices = compute_cross_nn_distances(
417+
X_new, self.X, self.maxk, self.metric, self.period
418+
)
419+
420+
kstar = cd._compute_kstar_interp(
421+
self.intrinsic_dim,
422+
X_new.shape[0],
423+
self.maxk,
424+
Dthr,
425+
cross_dist_indices,
426+
cross_distances,
427+
self.distances,
428+
)
429+
430+
cluster_assignment_kstar = [
431+
self.cluster_assignment[cross_dist_indices[i][: kstar[i]]]
432+
for i in np.arange(len(X_new))
433+
]
434+
cross_distances_kstar = [
435+
cross_distances[i][: kstar[i]] for i in np.arange(len(X_new))
436+
]
437+
cluster_probability = np.zeros((len(cluster_assignment_kstar), self.N_clusters))
438+
439+
for i in np.arange(len(X_new)):
440+
if len(set(cluster_assignment_kstar[i])) == 1:
441+
cluster_probability[i, cluster_assignment_kstar[i][0]] = 1
442+
else:
443+
cluster_probability[i] = _cluster_weight_smooth_assignment(
444+
cross_distances_kstar[i],
445+
cluster_assignment_kstar[i],
446+
self.N_clusters,
447+
)
448+
cluster_prediction = np.argmax(cluster_probability, axis=-1)
449+
return cluster_prediction, cluster_probability
450+
335451
# ------------ helper methods for compute_clustering_ADP_pure_python ------------ #
336452

337453
def _find_density_modes(self, g):
@@ -628,3 +744,19 @@ def _finalise_clustering( # noqa: C901
628744
log_den_bord_err_m,
629745
bord_indices_m,
630746
)
747+
748+
749+
def _cluster_weight_smooth_assignment(
750+
cross_distances_kstar, cluster_assignment_kstar, N_clusters
751+
):
752+
753+
weights_kstar = (
754+
cross_distances_kstar[-1] - cross_distances_kstar
755+
) ** 2 / cross_distances_kstar**2
756+
normalization = np.sum(weights_kstar)
757+
cluster_probability = [
758+
np.sum(weights_kstar[cluster_assignment_kstar == c])
759+
for c in np.arange(N_clusters)
760+
]
761+
cluster_probability = np.array(cluster_probability) / normalization
762+
return cluster_probability
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# Copyright 2021-2022 The DADApy Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ==============================================================================
15+
16+
"""Module for testing DP clustering."""
17+
18+
import os
19+
20+
import numpy as np
21+
22+
from dadapy import Clustering
23+
24+
filename = os.path.join(os.path.split(__file__)[0], "../2gaussians_in_2d.npy")
25+
26+
X = np.load(filename)
27+
28+
expected_cluster_assignment = np.array(
29+
[
30+
1,
31+
1,
32+
1,
33+
1,
34+
1,
35+
1,
36+
1,
37+
1,
38+
1,
39+
1,
40+
1,
41+
1,
42+
1,
43+
1,
44+
1,
45+
1,
46+
1,
47+
1,
48+
1,
49+
1,
50+
1,
51+
1,
52+
1,
53+
1,
54+
1,
55+
1,
56+
1,
57+
1,
58+
1,
59+
1,
60+
1,
61+
1,
62+
1,
63+
1,
64+
1,
65+
1,
66+
1,
67+
1,
68+
1,
69+
1,
70+
1,
71+
1,
72+
1,
73+
1,
74+
1,
75+
1,
76+
1,
77+
1,
78+
1,
79+
1,
80+
0,
81+
0,
82+
0,
83+
0,
84+
0,
85+
0,
86+
0,
87+
0,
88+
0,
89+
0,
90+
0,
91+
0,
92+
0,
93+
0,
94+
0,
95+
0,
96+
0,
97+
0,
98+
0,
99+
0,
100+
0,
101+
0,
102+
0,
103+
0,
104+
0,
105+
0,
106+
0,
107+
0,
108+
0,
109+
0,
110+
0,
111+
0,
112+
0,
113+
0,
114+
0,
115+
0,
116+
0,
117+
0,
118+
0,
119+
0,
120+
0,
121+
0,
122+
0,
123+
0,
124+
0,
125+
0,
126+
0,
127+
0,
128+
0,
129+
0,
130+
]
131+
)
132+
133+
134+
def test_predict_DP_PAk():
135+
"""Test the prediction using PAk clustering works correctly."""
136+
cl = Clustering(coordinates=X[25:-25])
137+
cl.compute_clustering_ADP(halo=False)
138+
preds0, _ = cl.predict_cluster_DP(X[-25:], density_est="PAk")
139+
preds1, _ = cl.predict_cluster_DP(X[:25], density_est="PAk")
140+
assert (preds1 == cl.cluster_assignment[:25]).all()
141+
assert (preds0 == cl.cluster_assignment[-25:]).all()
142+
143+
144+
def test_predict_DP_kstarNN():
145+
"""Test the prediction using kstarNN clustering works correctly."""
146+
cl = Clustering(coordinates=X[25:-25])
147+
cl.compute_clustering_ADP(halo=False)
148+
preds0, _ = cl.predict_cluster_DP(X[-25:], density_est="kstarNN")
149+
preds1, _ = cl.predict_cluster_DP(X[:25], density_est="kstarNN")
150+
assert (preds1 == cl.cluster_assignment[:25]).all()
151+
assert (preds0 == cl.cluster_assignment[-25:]).all()
152+
153+
154+
def test_predict_smooth():
155+
"""Test the smooth prediction using kstar neighbour distances works correctly."""
156+
cl = Clustering(coordinates=X[25:-25])
157+
cl.compute_clustering_ADP(halo=False)
158+
preds0, _ = cl.predict_cluster_inverse_distance_smooth(X[-25:])
159+
preds1, _ = cl.predict_cluster_inverse_distance_smooth(X[:25])
160+
assert (preds1 == cl.cluster_assignment[:25]).all()
161+
assert (preds0 == cl.cluster_assignment[-25:]).all()

0 commit comments

Comments
 (0)