|
26 | 26 | import scipy as sp
|
27 | 27 |
|
28 | 28 | from dadapy._cython import cython_clustering as cf
|
| 29 | +from dadapy._cython import cython_density as cd |
| 30 | +from dadapy._utils.density_estimation import ( |
| 31 | + return_not_normalised_density_kstarNN, |
| 32 | + return_not_normalised_density_PAk, |
| 33 | +) |
| 34 | +from dadapy._utils.utils import compute_cross_nn_distances |
29 | 35 | from dadapy.density_estimation import DensityEstimation
|
30 | 36 |
|
31 | 37 | cores = multiprocessing.cpu_count()
|
@@ -332,6 +338,116 @@ def compute_clustering_ADP_pure_python( # noqa: C901
|
332 | 338 |
|
333 | 339 | return self.cluster_assignment
|
334 | 340 |
|
def predict_cluster_DP(self, X_new, Dthr=23.92812698, density_est="PAk"):
    """Predict the cluster of points outside the initialization set using Density Peaks.

    Each new point is given the cluster of the first entry, among its ``maxk``
    neighbours in the initialization set, whose lower density bound
    (``log_den - log_den_err``) exceeds the lower density bound estimated for the
    new point. If no such neighbour exists, the cluster of the first neighbour of
    that point is used instead.

    Args:
        X_new (np.ndarray(float)): The points for which to predict cluster assignment
        Dthr (float): Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds
            to a p-value of 1e-6.
        density_est (str): Density estimator applied to the new points, either "PAk" or "kstarNN".

    Returns:
        cluster_prediction (np.ndarray(int)): prediction of points to specific clusters
        cluster_probability (np.ndarray(float)): one-hot rows marking the predicted cluster of each point

    Raises:
        ValueError: if ``density_est`` is neither "PAk" nor "kstarNN".
    """
    # Fail early with a clear message instead of hitting an unbound ``log_den`` below.
    if density_est not in ("PAk", "kstarNN"):
        raise ValueError(
            "density_est must be either 'PAk' or 'kstarNN', got {!r}".format(
                density_est
            )
        )

    cross_distances, cross_dist_indices = compute_cross_nn_distances(
        X_new, self.X, self.maxk, self.metric, self.period
    )

    kstar = cd._compute_kstar_interp(
        self.intrinsic_dim,
        X_new.shape[0],
        self.maxk,
        Dthr,
        cross_dist_indices,
        cross_distances,
        self.distances,
    )

    if density_est == "PAk":
        log_den, log_den_err, _ = return_not_normalised_density_PAk(
            cross_distances,
            self.intrinsic_dim,
            kstar,
            self.maxk,
            interpolation=True,
        )
    else:
        log_den, log_den_err, _ = return_not_normalised_density_kstarNN(
            cross_distances, self.intrinsic_dim, kstar, interpolation=True
        )

    log_den -= np.log(self.N)

    n_new = len(X_new)
    cluster_probability = np.zeros((n_new, self.N_clusters))
    for i in range(n_new):
        # Only the neighbour row of point i is needed; indexing the whole
        # neighbour matrix on every iteration is wasted work.
        neighbours = cross_dist_indices[i]
        higher_density_neighbours = (
            self.log_den[neighbours] - self.log_den_err[neighbours]
            > log_den[i] - log_den_err[i]
        )
        if higher_density_neighbours.any():
            # argmax on a boolean mask returns the position of the first True,
            # i.e. the first listed neighbour with higher density
            # (presumably the nearest one -- assumes rows are sorted by distance).
            reference = neighbours[np.argmax(higher_density_neighbours)]
        else:
            # If no data with higher density is found in the neighbourhood,
            # predict the cluster of the closest data point *of this point*.
            reference = neighbours[0]
        cluster_probability[i, self.cluster_assignment[reference]] = 1

    cluster_prediction = np.argmax(cluster_probability, axis=-1)
    return cluster_prediction, cluster_probability
| 403 | + |
def predict_cluster_inverse_distance_smooth(self, X_new, Dthr=23.92812698):
    """Predict the cluster of points outside the initialization set with a smooth estimator.

    For every new point the first ``kstar`` neighbours in the initialization set
    are inspected. When they all share one cluster, that cluster gets probability
    1; otherwise the per-cluster probabilities are those returned by
    ``_cluster_weight_smooth_assignment`` for the neighbour distances and labels.

    Args:
        X_new (np.ndarray(float)): The points for which to predict cluster assignment
        Dthr (float): Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds
            to a p-value of 1e-6.

    Returns:
        cluster_prediction (np.ndarray(int)): prediction of points to specific clusters
        cluster_probability (np.ndarray(float)): probability of points to belong to specific clusters
    """
    nn_distances, nn_indices = compute_cross_nn_distances(
        X_new, self.X, self.maxk, self.metric, self.period
    )

    kstar = cd._compute_kstar_interp(
        self.intrinsic_dim,
        X_new.shape[0],
        self.maxk,
        Dthr,
        nn_indices,
        nn_distances,
        self.distances,
    )

    n_new = len(X_new)
    cluster_probability = np.zeros((n_new, self.N_clusters))

    for row in range(n_new):
        k = kstar[row]
        labels = self.cluster_assignment[nn_indices[row][:k]]

        # Unanimous neighbourhood: the prediction is certain.
        if len(set(labels)) == 1:
            cluster_probability[row, labels[0]] = 1
            continue

        # Mixed neighbourhood: share the probability according to the weights.
        cluster_probability[row] = _cluster_weight_smooth_assignment(
            nn_distances[row][:k],
            labels,
            self.N_clusters,
        )

    return np.argmax(cluster_probability, axis=-1), cluster_probability
| 450 | + |
335 | 451 | # ------------ helper methods for compute_clustering_ADP_pure_python ------------ #
|
336 | 452 |
|
337 | 453 | def _find_density_modes(self, g):
|
@@ -628,3 +744,19 @@ def _finalise_clustering( # noqa: C901
|
628 | 744 | log_den_bord_err_m,
|
629 | 745 | bord_indices_m,
|
630 | 746 | )
|
| 747 | + |
| 748 | + |
| 749 | +def _cluster_weight_smooth_assignment( |
| 750 | + cross_distances_kstar, cluster_assignment_kstar, N_clusters |
| 751 | +): |
| 752 | + |
| 753 | + weights_kstar = ( |
| 754 | + cross_distances_kstar[-1] - cross_distances_kstar |
| 755 | + ) ** 2 / cross_distances_kstar**2 |
| 756 | + normalization = np.sum(weights_kstar) |
| 757 | + cluster_probability = [ |
| 758 | + np.sum(weights_kstar[cluster_assignment_kstar == c]) |
| 759 | + for c in np.arange(N_clusters) |
| 760 | + ] |
| 761 | + cluster_probability = np.array(cluster_probability) / normalization |
| 762 | + return cluster_probability |
0 commit comments