Skip to content

Commit 8a6a89e

Browse files
authored
Bugs and performance issues related to PCov-ized sample selection methods (#118)
* Possible bug in the pcovr orthogonalization concept. Loooots of printf debug statements. Also changed regression to lstsq which makes the y orthogonalization 10x faster * Force y to be 2D in pcov selection routines
1 parent fe24f38 commit 8a6a89e

File tree

4 files changed

+20
-15
lines changed

4 files changed

+20
-15
lines changed

skcosmo/_selection.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88

99
import numpy as np
1010
import scipy
11-
from scipy.linalg import eig
12-
from scipy.sparse.linalg import eigs as speig
11+
from scipy.linalg import eigh
12+
from scipy.sparse.linalg import eigsh
1313
from sklearn.base import (
1414
BaseEstimator,
1515
MetaEstimatorMixin,
@@ -142,6 +142,11 @@ def fit(self, X, y=None, warm_start=False):
142142
force_all_finite=not tags.get("allow_nan", True),
143143
multi_output=True,
144144
)
145+
if len(y.shape) == 1:
146+
# force y to have multi_output 2D format even when it's 1D, since
147+
# many functions, most notably PCov routines, assume an array storage
148+
# format, most notably to compute (y @ y.T)
149+
y = y.reshape((len(y), 1))
145150
else:
146151
X = check_array(
147152
X,
@@ -659,6 +664,8 @@ def _init_greedy_search(self, X, y, n_to_select):
659664
features and computes their initial importance score.
660665
"""
661666

667+
self.X_ref_ = X
668+
self.y_ref_ = y
662669
self.X_current_ = X.copy()
663670
if y is not None:
664671
self.y_current_ = y.copy()
@@ -760,15 +767,16 @@ def _compute_pi(self, X, y=None):
760767
)
761768

762769
if self.k < pcovr_distance.shape[0] - 1:
763-
v, U = speig(pcovr_distance, k=self.k, tol=1e-12)
770+
v, U = eigsh(pcovr_distance, k=self.k, tol=1e-12)
764771
else:
765-
v, U = eig(pcovr_distance)
772+
v, U = eigh(pcovr_distance)
766773
U = U[:, np.flip(np.argsort(v))]
767774
pi = (np.real(U)[:, : self.k] ** 2.0).sum(axis=1)
768775

769776
return pi
770777

771778
def _orthogonalize(self, last_selected):
779+
772780
if self._axis == 1:
773781
self.X_current_ = X_orthogonalizer(
774782
x1=self.X_current_, c=last_selected, tol=self.tolerance
@@ -777,18 +785,17 @@ def _orthogonalize(self, last_selected):
777785
self.X_current_ = X_orthogonalizer(
778786
x1=self.X_current_.T, c=last_selected, tol=self.tolerance
779787
).T
780-
781788
if self.y_current_ is not None:
782789
if self._axis == 1:
783790
self.y_current_ = Y_feature_orthogonalizer(
784791
self.y_current_, X=self.X_selected_, tol=self.tolerance
785792
)
786793
else:
787794
self.y_current_ = Y_sample_orthogonalizer(
788-
self.y_current_,
789-
self.X_current_,
790-
y_ref=self.y_selected_,
791-
X_ref=self.X_selected_,
795+
self.y_ref_,
796+
self.X_ref_,
797+
y_ref=self.y_selected_[: self.n_selected_],
798+
X_ref=self.X_selected_[: self.n_selected_],
792799
tol=self.tolerance,
793800
)
794801

skcosmo/utils/_orthogonalizers.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,7 @@ def Y_sample_orthogonalizer(y, X, y_ref, X_ref, tol=1e-12, copy=True):
135135
136136
"""
137137

138-
y_frag = (
139-
X @ (np.linalg.pinv(X_ref.T @ X_ref, rcond=tol) @ X_ref.T) @ y_ref
140-
).reshape(y.shape)
138+
y_frag = (X @ (np.linalg.lstsq(X_ref, y_ref, rcond=tol)[0])).reshape(y.shape)
141139

142140
if copy:
143141
return y.copy() - y_frag

tests/test_sample_pcov_cur.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
class TestPCovCUR(unittest.TestCase):
1212
def setUp(self):
1313
self.X, self.y = load_boston(return_X_y=True)
14-
self.idx = [488, 283, 183, 380, 41, 438, 368, 374, 123, 353]
14+
self.idx = [492, 450, 183, 199, 380, 228, 399, 126, 412, 368]
1515

1616
def test_known(self):
1717
"""
@@ -50,7 +50,7 @@ def test_non_it(self):
5050
"""
5151
This test checks that the model can be run non-iteratively
5252
"""
53-
self.idx = [488, 492, 491, 374, 398, 373, 386, 400, 383, 382]
53+
self.idx = [492, 488, 491, 489, 374, 373, 386, 398, 383, 382]
5454
selector = PCovCUR(n_to_select=10, iterative=False)
5555
selector.fit(self.X, self.y)
5656

tests/test_sample_pcov_fps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
class TestPCovFPS(unittest.TestCase):
99
def setUp(self):
1010
self.X, self.y = load_boston(return_X_y=True)
11-
self.idx = [39, 410, 492, 102, 54, 413, 34, 346, 126, 134, 433, 380]
11+
self.idx = [39, 410, 492, 102, 54, 413, 34, 126, 346, 134, 433, 380]
1212

1313
def test_restart(self):
1414
"""

0 commit comments

Comments
 (0)