Skip to content

Fixing negative distances for fitted points with the DirectionalConvexHull #165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 87 additions & 25 deletions skmatter/sample_selection/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,29 @@ def __init__(
)


def _directional_distance(equations, points):
"""
Computes the distance of the points to the planes defined by the equations
with respect to the direction of the first dimension.

equations : ndarray of shape (n_facets, n_dim)
each row contains the coefficienst for the plane equation of the form
equations[i, 0]*x_1 + ...
+ equations[i, -2]*x_{n_dim} = equations[i, -1]
-equations[i, -1] is the offset
points : ndarray of shape (n_samples, n_dim)
points to compute the directional distance from

Returns
-------
directional_distance : ndarray of shape (nsamples, nequations)
closest distance wrt. the first dimension of the point to the planes
defined by the equations
"""
orthogonal_distances = -(points @ equations[:, :-1].T) - equations[:, -1:].T
return -orthogonal_distances / equations[:, :1].T


class DirectionalConvexHull:
"""
Performs Sample Selection by constructing a Directional Convex Hull and determining the distance to the hull as outlined in the reference
Expand All @@ -411,6 +434,14 @@ class DirectionalConvexHull:
directional convex hull construction (also known as the low-
dimensional (LD) hull). By default [0] is used.

tolerance : float, default=1.0E-12
Tolerance for the negative distances to the directional
convex hull to consider a point below the convex hull. Depending
if a point is below or above the convex hull the distance is
differently computed. A very low value can result in a completely
wrong distances. Distances cannot be distinguished from zero up
to tolerance. It is recommended to leave the default setting.

Attributes
----------

Expand All @@ -426,22 +457,20 @@ class DirectionalConvexHull:
Interpolater for the features in the high-
dimensional space

interpolator_y_ : scipy.interpolate.interpnd.LinearNDInterpolator
Interpolater for the targets y

References
----------
.. [dch] A. Anelli, E. A. Engel, C. J. Pickard and M. Ceriotti,
Physical Review Materials, 2018.
"""

def __init__(self, low_dim_idx=None):
def __init__(self, low_dim_idx=None, tolerance=1e-12):
self.low_dim_idx = low_dim_idx

if low_dim_idx is None:
self.low_dim_idx = [0]
else:
self.low_dim_idx = low_dim_idx
self.tolerance = tolerance

def fit(self, X, y):
"""
Expand Down Expand Up @@ -490,39 +519,43 @@ def fit(self, X, y):
# create high-dimensional feature matrix
high_dim_feats = X[:, self.high_dim_idx_]
# get scipy convex hull
convex_hull = ConvexHull(convex_hull_data)
self.convex_hull_ = ConvexHull(convex_hull_data, incremental=True)
# get normal equations to the hull simplices
y_normal = convex_hull.equations[:, 0]
y_normal = self.convex_hull_.equations[:, 0]

# get vertices_idx of the convex hull
self.selected_idx_ = np.unique(
convex_hull.simplices[np.where(y_normal < 0)[0]].flatten()
)
directional_facets_idx = np.where(y_normal < 0)[0]
self.directional_simplices_ = self.convex_hull_.simplices[
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should probably be a hidden variable, no?

directional_facets_idx
]
self._directional_equations_ = self.convex_hull_.equations[
directional_facets_idx
]

self.selected_idx_ = np.unique(self.directional_simplices_.flatten())
self._directional_points_ = convex_hull_data[self.selected_idx_]

# required for the score_feature_matrix function
self.interpolator_high_dim_ = _linear_interpolator(
points=convex_hull_data[self.selected_idx_, 1:],
values=high_dim_feats[self.selected_idx_],
)

# required to compute the distance of the low-dimensional feature to the convex hull
self.interpolator_y_ = _linear_interpolator(
points=convex_hull_data[self.selected_idx_, 1:],
values=convex_hull_data[self.selected_idx_, 0],
)

return self

@property
def directional_vertices_(self):
return self.selected_idx_

def _check_X_y(self, X, y):
return check_X_y(X, y, ensure_min_features=2, multi_output=False)
return check_X_y(X, y, ensure_min_features=1, multi_output=False)

def _check_is_fitted(self, X):
check_is_fitted(
self,
[
"high_dim_idx_",
"interpolator_high_dim_",
"interpolator_y_",
"selected_idx_",
],
)
Expand Down Expand Up @@ -556,7 +589,7 @@ def score_samples(self, X, y):

Returns
-------
dch_distance : numpy.array of shape (n_samples, len(high_dim_idx_))
dch_distance : ndarray of shape (n_samples, len(high_dim_idx_))
The distance (residuals) of samples to the convex hull in
the higher-dimensional space.
"""
Expand All @@ -565,17 +598,46 @@ def score_samples(self, X, y):

# features used for the convex hull construction
low_dim_feats = X[:, self.low_dim_idx]
hull_space_data = np.hstack((y.reshape(-1, 1), low_dim_feats))

# the X points projected on the convex surface
interpolated_y = self.interpolator_y_(low_dim_feats).reshape(y.shape)
return self._directional_convex_hull_distance(hull_space_data).reshape(y.shape)

if np.any(np.isnan(interpolated_y)):
warnings.warn(
"There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans.",
UserWarning,
)
def _directional_convex_hull_distance(self, points):
"""
Get the distance to the fitted directional convex hull in the target dimension y.

If a point lies above the convex hull, it will have a positive distance.
If a point lies below the convex hull, it will have a negative distance - this should only
occur if you pass a point that the convex hull has not been fitted on in the `fit` function.
If a point lies on the convex hull, it will have a distance of zero.

return y - interpolated_y
Parameters
----------
points : The points for which you would like to calculate the distance to the
fitted directional convex hull.
"""
# directional distances to all plane equations
all_directional_distances = _directional_distance(
self._directional_equations_, points
)
print(all_directional_distances)
# we get negative distances for each plane to check if any distance is below the threshold
below_directional_convex_hull = np.any(
all_directional_distances < -self.tolerance, axis=1
)
# directional distances to corresponding plane equation
directional_distances = np.zeros(len(points))
directional_distances[~below_directional_convex_hull] = np.min(
all_directional_distances[~below_directional_convex_hull], axis=1
)
# some distances can be positive, so we take the max of all negative distances
negative_directional_distances = all_directional_distances.copy()
negative_directional_distances[all_directional_distances > 0] = -np.inf
directional_distances[below_directional_convex_hull] = np.max(
negative_directional_distances[below_directional_convex_hull], axis=1
)
return directional_distances

def score_feature_matrix(self, X):
"""
Expand Down
70 changes: 56 additions & 14 deletions tests/test_dch.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,64 @@ def test_negative_score(self):
distance = selector.score_samples(
self.below_hull_point[:, 1:], self.below_hull_point[:, 0]
)[0]
print("distance", distance)
self.assertTrue(distance < 0.0)

def test_positive_score(self):
"""
Ensure that when we score on the points we fitted that we obtain only >= 0 distances
In an old implementation we observed this bug for the dataset we use in this test
(see issue #162)
"""
X = [
[1.88421449, 0.86675162],
[1.88652863, 0.86577001],
[1.89200182, 0.86573224],
[1.89664107, 0.86937211],
[1.90181908, 0.85964603],
[1.90313135, 0.85695238],
[1.90063025, 0.84948309],
[1.90929015, 0.87526563],
[1.90924666, 0.85509754],
[1.91139146, 0.86115512],
[1.91199225, 0.8681867],
[1.90681563, 0.85036791],
[1.90193881, 0.84168907],
[1.90544262, 0.84451744],
[1.91498802, 0.86010812],
[1.91305204, 0.85333203],
[1.89779902, 0.83731807],
[1.91725967, 0.86630218],
[1.91309514, 0.85046796],
[1.89822103, 0.83522425],
]
y = [
-2.69180967,
-2.72443825,
-2.77293913,
-2.797828,
-2.12097652,
-2.69428482,
-2.70275134,
-2.80617667,
-2.79199375,
-2.01707974,
-2.74203922,
-2.24217962,
-2.03472,
-2.72612763,
-2.7071123,
-2.75706683,
-2.68925596,
-2.77160335,
-2.69528665,
-2.70911598,
]
selector = DirectionalConvexHull(low_dim_idx=[0, 1])
selector.fit(X, y)
distances = selector.score_samples(X, y)
self.assertTrue(np.all(distances >= -selector.tolerance))

def test_score_function_warnings(self):
"""
Ensure that calling `score_samples` with points outside the range causes an error
Expand All @@ -118,20 +174,6 @@ def test_score_function_warnings(self):
y = [1.0, 2.0, 3.0]
selector.fit(X, y)

# check for score_samples
with warnings.catch_warnings(record=True) as warning:
# Cause all warnings to always be triggered.
warnings.simplefilter("always")
# Trigger a warning because it is outsite of range [0, 3]
selector.score_samples([[4.0, 1.0]], [1.0])
# Verify some things
self.assertTrue(len(warning) == 1)
self.assertTrue(issubclass(warning[0].category, UserWarning))
self.assertTrue(
"There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans."
== str(warning[0].message)
)

# check for score_feature_matrix
with warnings.catch_warnings(record=True) as warning:
# Cause all warnings to always be triggered.
Expand Down