From 7798af4fd0d7e9bc6467d03dfbde82d240a5f036 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 29 Jun 2020 17:22:56 +0100 Subject: [PATCH 01/20] Translate the discussion in https://discourse.smadstatgen.org/t/xarray-conventions-for-variation-data/44 into code --- setup.cfg | 2 +- sgkit/__init__.py | 16 +++++++-- sgkit/api.py | 70 +++++++++++++++++++++++++++++++++++++++ sgkit/core.py | 7 ---- sgkit/tests/test_api.py | 45 +++++++++++++++++++++++++ sgkit/tests/test_core.py | 10 ------ sgkit/tests/test_utils.py | 20 +++++++++++ sgkit/utils.py | 24 ++++++++++++++ 8 files changed, 174 insertions(+), 20 deletions(-) create mode 100644 sgkit/api.py delete mode 100644 sgkit/core.py create mode 100644 sgkit/tests/test_api.py delete mode 100644 sgkit/tests/test_core.py create mode 100644 sgkit/tests/test_utils.py create mode 100644 sgkit/utils.py diff --git a/setup.cfg b/setup.cfg index d32a77d78..d9f4c83a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = numpy,xarray +known_third_party = numpy,pytest,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 15bd521e2..8b170a470 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -1,3 +1,15 @@ -from .core import create_genotype_call_dataset # noqa: F401 +from .api import ( # noqa: F401 + DIM_ALLELE, + DIM_PLOIDY, + DIM_SAMPLE, + DIM_VARIANT, + create_genotype_call_dataset, +) -__all__ = ["create_genotype_call_dataset"] +__all__ = [ + "DIM_ALLELE", + "DIM_PLOIDY", + "DIM_SAMPLE", + "DIM_VARIANT", + "create_genotype_call_dataset", +] diff --git a/sgkit/api.py b/sgkit/api.py new file mode 100644 index 000000000..708209b4d --- /dev/null +++ b/sgkit/api.py @@ -0,0 +1,70 @@ +from typing import Any, Hashable, Mapping + +import xarray as xr + +from .utils import check_array_like + +DIM_VARIANT = "variant" +DIM_SAMPLE = "sample" +DIM_PLOIDY = "ploidy" +DIM_ALLELE = "allele" +DIM_GENOTYPE = "genotype" + + +def create_genotype_call_dataset( + variant_contig: Any, + variant_pos: Any, + variant_alleles: Any, + sample_id: Any, + call_gt: Any, + call_gt_phased: Any = None, + variant_id: Any = None, +) -> xr.Dataset: + """Create a dataset of genotype calls. + + Parameters + ---------- + variant_contig : array_like, np.int8 + The (index of the) contig for each variant. + variant_pos : array_like, np.int32 + The reference position of the variant. + variant_alleles : array_like, S1 + The possible alleles for the variant. + sample_id : array_like, str + The unique identifier of the sample. + call_gt : array_like, np.int8 + Genotype, encoded as allele values (0 for the reference, 1 for + the first allele, 2 for the second allele), or -1 to indicate a + missing value. + call_gt_phased : array_like, bool, optional + A flag for each call indicating if it is phased or not. If + omitted all calls are unphased. + variant_id: array_like, str, optional + The unique identifier of the variant. + + Returns + ------- + xr.Dataset + The dataset of genotype calls. 
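A minimal usage sketch of this constructor; the array values below are illustrative, not taken from the repository:

    import numpy as np
    from sgkit import create_genotype_call_dataset

    ds = create_genotype_call_dataset(
        variant_contig=np.array([0, 0], dtype="i1"),
        variant_pos=np.array([1000, 2000], dtype="i4"),
        variant_alleles=np.array([["A", "C"], ["G", "T"]], dtype="S1"),
        sample_id=np.array(["s1", "s2"], dtype=str),
        call_gt=np.array([[[0, 1], [1, 1]], [[0, 0], [-1, -1]]], dtype="i1"),
    )
    # The result carries "variant/CONTIG", "variant/POS", "variant/ALLELES",
    # "sample/ID", "call/GT" and a derived "call/GT_mask" that flags the -1 calls.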
+ + """ + check_array_like(variant_contig, kind="i", ndim=1) + check_array_like(variant_pos, kind="i", ndim=1) + check_array_like(variant_alleles, kind="S", ndim=2) + check_array_like(sample_id, kind="U", ndim=1) + check_array_like(call_gt, kind="i", ndim=3) + data_vars: Mapping[Hashable, Any] = { + "variant/CONTIG": ([DIM_VARIANT], variant_contig), + "variant/POS": ([DIM_VARIANT], variant_pos), + "variant/ALLELES": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), + "sample/ID": ([DIM_SAMPLE], sample_id), + "call/GT": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt), + "call/GT_mask": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt < 0), + } + if call_gt_phased is not None: + check_array_like(call_gt_phased, kind="b", ndim=2) + data_vars["call/GT_phased"] = ([DIM_VARIANT, DIM_SAMPLE], call_gt_phased) + if variant_id is not None: + check_array_like(variant_id, kind="U", ndim=1) + data_vars["variant/ID"] = ([DIM_VARIANT], variant_id) + return xr.Dataset(data_vars) diff --git a/sgkit/core.py b/sgkit/core.py deleted file mode 100644 index b34368b42..000000000 --- a/sgkit/core.py +++ /dev/null @@ -1,7 +0,0 @@ -from typing import Any - -import xarray as xr - - -def create_genotype_call_dataset(arr: Any) -> xr.Dataset: - return xr.Dataset({"data": (["variant", "sample", "genotype"], arr)}) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py new file mode 100644 index 000000000..4768ec7bd --- /dev/null +++ b/sgkit/tests/test_api.py @@ -0,0 +1,45 @@ +import numpy as np +from numpy.testing import assert_array_equal + +from sgkit import ( + DIM_ALLELE, + DIM_PLOIDY, + DIM_SAMPLE, + DIM_VARIANT, + create_genotype_call_dataset, +) + + +def test_create_genotype_call_dataset(): + variant_contig = np.array([0, 0], dtype="i1") + variant_pos = np.array([1000, 2000], dtype="i4") + variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") + variant_id = np.array(["rs1", "rs2"], dtype=str) + sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) + call_gt = np.array( + [[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]], dtype="i1" + ) + call_gt_phased = np.array([[True, True, False], [True, False, False]], dtype=bool) + ds = create_genotype_call_dataset( + variant_contig, + variant_pos, + variant_alleles, + sample_id, + call_gt, + call_gt_phased=call_gt_phased, + variant_id=variant_id, + ) + + assert DIM_VARIANT in ds.dims + assert DIM_SAMPLE in ds.dims + assert DIM_PLOIDY in ds.dims + assert DIM_ALLELE in ds.dims + + assert_array_equal(ds["variant/CONTIG"], variant_contig) + assert_array_equal(ds["variant/POS"], variant_pos) + assert_array_equal(ds["variant/ALLELES"], variant_alleles) + assert_array_equal(ds["variant/ID"], variant_id) + assert_array_equal(ds["sample/ID"], sample_id) + assert_array_equal(ds["call/GT"], call_gt) + assert_array_equal(ds["call/GT_mask"], call_gt < 0) + assert_array_equal(ds["call/GT_phased"], call_gt_phased) diff --git a/sgkit/tests/test_core.py b/sgkit/tests/test_core.py deleted file mode 100644 index fc622aaa4..000000000 --- a/sgkit/tests/test_core.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy as np - -from sgkit import create_genotype_call_dataset - - -def test_create_genotype_call_dataset(): - arr = np.random.rand(4, 5, 3) # NB: not valid genotype data! 
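The replacement API validates its inputs with the check_array_like helper introduced in this patch (sgkit/utils.py); a sketch of illustrative calls:

    import numpy as np
    from sgkit.utils import check_array_like

    a = np.zeros((3, 2), dtype="i4")
    check_array_like(a, kind="i", ndim=2)   # passes: integer kind, rank 2
    check_array_like(a, ndim={1, 2})        # ndim (and dtype) may also be a set of allowed values
    # check_array_like(a, kind="f")         # would raise TypeError (wrong dtype kind)
    # check_array_like(a, ndim=1)           # would raise ValueError (wrong rank)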
- ds = create_genotype_call_dataset(arr) - assert "sample" in ds.dims - assert "variant" in ds.dims diff --git a/sgkit/tests/test_utils.py b/sgkit/tests/test_utils.py new file mode 100644 index 000000000..4688619d2 --- /dev/null +++ b/sgkit/tests/test_utils.py @@ -0,0 +1,20 @@ +import numpy as np +import pytest + +from sgkit.utils import check_array_like + + +def test_check_array_like(): + with pytest.raises(TypeError): + check_array_like("foo") + a = np.arange(100, dtype="i4") + with pytest.raises(TypeError): + check_array_like(a, dtype="i8") + with pytest.raises(TypeError): + check_array_like(a, dtype={"i1", "i2"}) + with pytest.raises(TypeError): + check_array_like(a, kind="f") + with pytest.raises(ValueError): + check_array_like(a, ndim=2) + with pytest.raises(ValueError): + check_array_like(a, ndim={2, 3}) diff --git a/sgkit/utils.py b/sgkit/utils.py new file mode 100644 index 000000000..47d2245a9 --- /dev/null +++ b/sgkit/utils.py @@ -0,0 +1,24 @@ +import numpy as np + + +def check_array_like(a, dtype=None, kind=None, ndim=None): + array_attrs = "ndim", "dtype", "shape" + for k in array_attrs: + if not hasattr(a, k): + raise TypeError + if dtype is not None: + if isinstance(dtype, set): + dtype = {np.dtype(t) for t in dtype} + if a.dtype not in dtype: + raise TypeError + elif a.dtype != np.dtype(dtype): + raise TypeError + if kind is not None: + if a.dtype.kind not in kind: + raise TypeError + if ndim is not None: + if isinstance(ndim, set): + if a.ndim not in ndim: + raise ValueError + elif ndim != a.ndim: + raise ValueError From 88dbce3e075d11797cf4c76c159a5f3b512463a3 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 30 Jun 2020 17:03:15 +0100 Subject: [PATCH 02/20] Loosen integer array types in documentation. --- sgkit/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index 708209b4d..b5aa7e3ea 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -24,15 +24,15 @@ def create_genotype_call_dataset( Parameters ---------- - variant_contig : array_like, np.int8 + variant_contig : array_like, int The (index of the) contig for each variant. - variant_pos : array_like, np.int32 + variant_pos : array_like, int The reference position of the variant. variant_alleles : array_like, S1 The possible alleles for the variant. sample_id : array_like, str The unique identifier of the sample. - call_gt : array_like, np.int8 + call_gt : array_like, int Genotype, encoded as allele values (0 for the reference, 1 for the first allele, 2 for the second allele), or -1 to indicate a missing value. From a5bca262715d457f8e85e9b34e667ceb0a0b6085 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 30 Jun 2020 17:42:00 +0100 Subject: [PATCH 03/20] Store contig names as an attribute --- sgkit/api.py | 8 ++++++-- sgkit/tests/test_api.py | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index b5aa7e3ea..e0220eed2 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -1,4 +1,4 @@ -from typing import Any, Hashable, Mapping +from typing import Any, Hashable, List, Mapping import xarray as xr @@ -12,6 +12,7 @@ def create_genotype_call_dataset( + variant_contig_names: List[str], variant_contig: Any, variant_pos: Any, variant_alleles: Any, @@ -24,6 +25,8 @@ def create_genotype_call_dataset( Parameters ---------- + variant_contig_names : list of str + The contig names. variant_contig : array_like, int The (index of the) contig for each variant. 
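Since variant_contig holds indices into the contig-name list that this patch stores in the dataset's "contigs" attribute, names can be recovered with a plain lookup; a sketch assuming ds was built by create_genotype_call_dataset:

    import numpy as np

    contig_names = np.asarray(ds.attrs["contigs"])             # e.g. ["chr1", "chr2"]
    per_variant_contig = contig_names[ds["variant/CONTIG"].values]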
variant_pos : array_like, int @@ -67,4 +70,5 @@ def create_genotype_call_dataset( if variant_id is not None: check_array_like(variant_id, kind="U", ndim=1) data_vars["variant/ID"] = ([DIM_VARIANT], variant_id) - return xr.Dataset(data_vars) + attrs = {"contigs": variant_contig_names} + return xr.Dataset(data_vars=data_vars, attrs=attrs) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py index 4768ec7bd..9ea2af739 100644 --- a/sgkit/tests/test_api.py +++ b/sgkit/tests/test_api.py @@ -11,6 +11,7 @@ def test_create_genotype_call_dataset(): + variant_contig_names = ["chr1"] variant_contig = np.array([0, 0], dtype="i1") variant_pos = np.array([1000, 2000], dtype="i4") variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") @@ -21,6 +22,7 @@ def test_create_genotype_call_dataset(): ) call_gt_phased = np.array([[True, True, False], [True, False, False]], dtype=bool) ds = create_genotype_call_dataset( + variant_contig_names, variant_contig, variant_pos, variant_alleles, @@ -35,6 +37,7 @@ def test_create_genotype_call_dataset(): assert DIM_PLOIDY in ds.dims assert DIM_ALLELE in ds.dims + assert ds.attrs["contigs"] == variant_contig_names assert_array_equal(ds["variant/CONTIG"], variant_contig) assert_array_equal(ds["variant/POS"], variant_pos) assert_array_equal(ds["variant/ALLELES"], variant_alleles) From b34b2708bca67b3f80222967827c1f9fce6753b0 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 1 Jul 2020 10:16:32 +0100 Subject: [PATCH 04/20] Move away from ALL_CAPS names. Pluralise dimension names. --- sgkit/api.py | 50 +++++++++++++++++++++++------------------ sgkit/tests/test_api.py | 30 +++++++++++++------------ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index e0220eed2..84c1b4847 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -4,21 +4,21 @@ from .utils import check_array_like -DIM_VARIANT = "variant" -DIM_SAMPLE = "sample" +DIM_VARIANT = "variants" +DIM_SAMPLE = "samples" DIM_PLOIDY = "ploidy" -DIM_ALLELE = "allele" -DIM_GENOTYPE = "genotype" +DIM_ALLELE = "alleles" +DIM_GENOTYPE = "genotypes" def create_genotype_call_dataset( variant_contig_names: List[str], variant_contig: Any, - variant_pos: Any, + variant_position: Any, variant_alleles: Any, sample_id: Any, - call_gt: Any, - call_gt_phased: Any = None, + call_genotype: Any, + call_genotype_phased: Any = None, variant_id: Any = None, ) -> xr.Dataset: """Create a dataset of genotype calls. @@ -29,17 +29,17 @@ def create_genotype_call_dataset( The contig names. variant_contig : array_like, int The (index of the) contig for each variant. - variant_pos : array_like, int + variant_position : array_like, int The reference position of the variant. variant_alleles : array_like, S1 The possible alleles for the variant. sample_id : array_like, str The unique identifier of the sample. - call_gt : array_like, int + call_genotype : array_like, int Genotype, encoded as allele values (0 for the reference, 1 for the first allele, 2 for the second allele), or -1 to indicate a missing value. - call_gt_phased : array_like, bool, optional + call_genotype_phased : array_like, bool, optional A flag for each call indicating if it is phased or not. If omitted all calls are unphased. 
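A small sketch of the genotype encoding and the mask derived from it; the values are illustrative:

    import numpy as np

    call_genotype = np.array([[[0, 1], [1, 1], [-1, -1]]], dtype="i1")
    # shape (variants, samples, ploidy); 0 = reference allele,
    # 1 = first alternate allele, -1 = missing call
    call_genotype_mask = call_genotype < 0   # True wherever a call is missing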
variant_id: array_like, str, optional @@ -52,23 +52,29 @@ def create_genotype_call_dataset( """ check_array_like(variant_contig, kind="i", ndim=1) - check_array_like(variant_pos, kind="i", ndim=1) + check_array_like(variant_position, kind="i", ndim=1) check_array_like(variant_alleles, kind="S", ndim=2) check_array_like(sample_id, kind="U", ndim=1) - check_array_like(call_gt, kind="i", ndim=3) + check_array_like(call_genotype, kind="i", ndim=3) data_vars: Mapping[Hashable, Any] = { - "variant/CONTIG": ([DIM_VARIANT], variant_contig), - "variant/POS": ([DIM_VARIANT], variant_pos), - "variant/ALLELES": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), - "sample/ID": ([DIM_SAMPLE], sample_id), - "call/GT": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt), - "call/GT_mask": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt < 0), + "variant/contig": ([DIM_VARIANT], variant_contig), + "variant/position": ([DIM_VARIANT], variant_position), + "variant/alleles": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), + "sample/id": ([DIM_SAMPLE], sample_id), + "call/genotype": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_genotype), + "call/genotype_mask": ( + [DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], + call_genotype < 0, + ), } - if call_gt_phased is not None: - check_array_like(call_gt_phased, kind="b", ndim=2) - data_vars["call/GT_phased"] = ([DIM_VARIANT, DIM_SAMPLE], call_gt_phased) + if call_genotype_phased is not None: + check_array_like(call_genotype_phased, kind="b", ndim=2) + data_vars["call/genotype_phased"] = ( + [DIM_VARIANT, DIM_SAMPLE], + call_genotype_phased, + ) if variant_id is not None: check_array_like(variant_id, kind="U", ndim=1) - data_vars["variant/ID"] = ([DIM_VARIANT], variant_id) + data_vars["variant/id"] = ([DIM_VARIANT], variant_id) attrs = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py index 9ea2af739..9114a9554 100644 --- a/sgkit/tests/test_api.py +++ b/sgkit/tests/test_api.py @@ -13,22 +13,24 @@ def test_create_genotype_call_dataset(): variant_contig_names = ["chr1"] variant_contig = np.array([0, 0], dtype="i1") - variant_pos = np.array([1000, 2000], dtype="i4") + variant_position = np.array([1000, 2000], dtype="i4") variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") variant_id = np.array(["rs1", "rs2"], dtype=str) sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) - call_gt = np.array( + call_genotype = np.array( [[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]], dtype="i1" ) - call_gt_phased = np.array([[True, True, False], [True, False, False]], dtype=bool) + call_genotype_phased = np.array( + [[True, True, False], [True, False, False]], dtype=bool + ) ds = create_genotype_call_dataset( variant_contig_names, variant_contig, - variant_pos, + variant_position, variant_alleles, sample_id, - call_gt, - call_gt_phased=call_gt_phased, + call_genotype, + call_genotype_phased=call_genotype_phased, variant_id=variant_id, ) @@ -38,11 +40,11 @@ def test_create_genotype_call_dataset(): assert DIM_ALLELE in ds.dims assert ds.attrs["contigs"] == variant_contig_names - assert_array_equal(ds["variant/CONTIG"], variant_contig) - assert_array_equal(ds["variant/POS"], variant_pos) - assert_array_equal(ds["variant/ALLELES"], variant_alleles) - assert_array_equal(ds["variant/ID"], variant_id) - assert_array_equal(ds["sample/ID"], sample_id) - assert_array_equal(ds["call/GT"], call_gt) - assert_array_equal(ds["call/GT_mask"], call_gt < 0) - 
assert_array_equal(ds["call/GT_phased"], call_gt_phased) + assert_array_equal(ds["variant/contig"], variant_contig) + assert_array_equal(ds["variant/position"], variant_position) + assert_array_equal(ds["variant/alleles"], variant_alleles) + assert_array_equal(ds["variant/id"], variant_id) + assert_array_equal(ds["sample/id"], sample_id) + assert_array_equal(ds["call/genotype"], call_genotype) + assert_array_equal(ds["call/genotype_mask"], call_genotype < 0) + assert_array_equal(ds["call/genotype_phased"], call_genotype_phased) From 4aecff1db800aea2b85f1d194555968f9b4314ea Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 3 Jul 2020 23:42:51 +0000 Subject: [PATCH 05/20] Initial single continuous phenotype regression implementation --- setup.cfg | 2 +- sgkit/stats/__init__.py | 0 sgkit/stats/gwas.py | 88 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 sgkit/stats/__init__.py create mode 100644 sgkit/stats/gwas.py diff --git a/setup.cfg b/setup.cfg index dc5245e1a..35392d97e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = numpy,setuptools,xarray +known_third_party = dask,numpy,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/stats/__init__.py b/sgkit/stats/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sgkit/stats/gwas.py b/sgkit/stats/gwas.py new file mode 100644 index 000000000..72e7a4435 --- /dev/null +++ b/sgkit/stats/gwas.py @@ -0,0 +1,88 @@ +import collections +from typing import Sequence + +import dask.array as da +import numpy as np +from dask.array import stats +from xarray import Dataset + +from ..api import DIM_PLOIDY, DIM_VARIANT + +LinearRegressionResult = collections.namedtuple( + "LinearRegressionResult", ["betas", "t_values", "p_values"] +) + + +def _linear_regression(X, Z, y) -> LinearRegressionResult: + """Efficient linear regression estimation for multiple covariate sets + + Parameters + ---------- + X : (M, N) array-like + "Loop" covariates for which a separate regression will be fit to + individual columns + Z : (M, P) array-like + "Core" covariates that are included in the regressions along + with each loop covariate + y : (M,) + Continuous outcome + + Returns + ------- + tuple + [description] + """ + # Apply orthogonal projection to eliminate core covariates + print("Z", Z) + print("X", X) + print("y", y) + Xp = X - Z @ da.linalg.lstsq(Z, X)[0] + yp = y - Z @ da.linalg.lstsq(Z, y)[0] + + # Estimate coefficients for each loop covariate + Xps = (Xp ** 2).sum(axis=0) + b = (Xp.T @ yp) / Xps + + # Compute statistics and p values for each regression separately + dof = y.shape[0] - Z.shape[1] - 1 + y_resid = yp[:, np.newaxis] - Xp * b + rss = (y_resid ** 2).sum(axis=0) + t_val = b / np.sqrt((rss / dof) / Xps) + p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) + + return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) + + +def _get_loop_covariates(ds: Dataset, dosage: str = None): + if dosage is None: + X = ds["call/genotype"].sum(dim=DIM_PLOIDY) + else: + X = ds[dosage] + return da.asarray(X.data) + + +def _get_core_covariates( + ds: Dataset, covariates: Sequence[str], add_intercept: bool = False +): + X = da.stack([da.asarray(ds[c].data) for c in covariates]).T + if add_intercept: + X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) + return X.rechunk((-1, -1)) + + +def linear_regression( 
+ ds: Dataset, + covariates: Sequence[str], + dosage: str, + trait: str, + add_intercept: bool = True, +): + X = _get_loop_covariates(ds, dosage=dosage) + Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) + y = da.asarray(ds[trait].data) + res = _linear_regression(X.T, Z, y) + return ds.assign( + betas=(DIM_VARIANT, res.betas), + t_values=(DIM_VARIANT, res.t_values), + p_values=(DIM_VARIANT, res.p_values), + ) From 9ff934d48af2a50d8a6746fbc213b7009a9534b2 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:25:54 +0000 Subject: [PATCH 06/20] Basic regression validation on simulated data --- setup.cfg | 2 +- sgkit/stats/association.py | 138 ++++++++++++++++++++++++++++++++ sgkit/stats/gwas.py | 88 -------------------- sgkit/tests/test_association.py | 108 +++++++++++++++++++++++++ 4 files changed, 247 insertions(+), 89 deletions(-) create mode 100644 sgkit/stats/association.py delete mode 100644 sgkit/stats/gwas.py create mode 100644 sgkit/tests/test_association.py diff --git a/setup.cfg b/setup.cfg index 35392d97e..3d335c6b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,numpy,pytest,setuptools,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py new file mode 100644 index 000000000..d84ffe82b --- /dev/null +++ b/sgkit/stats/association.py @@ -0,0 +1,138 @@ +import collections +from typing import Sequence + +import dask.array as da +import numpy as np +import xarray as xr +from dask.array import stats +from xarray import Dataset + +from ..api import DIM_PLOIDY, DIM_VARIANT + +LinearRegressionResult = collections.namedtuple( + "LinearRegressionResult", ["betas", "t_values", "p_values"] +) + + +def _linear_regression(G, X, y) -> LinearRegressionResult: + """Efficient linear regression estimation for multiple covariate sets + + Parameters + ---------- + G : (M, N) array-like + "Loop" covariates for which a separate regression will be fit to + individual columns + X : (M, P) array-like + "Core" covariates that are included in the regressions along + with each loop covariate + y : (M,) + Continuous outcome + + Returns + ------- + tuple + [description] + """ + G, X = da.asarray(G), da.asarray(X) + + # Apply orthogonal projection to eliminate core covariates + # Note: dask lstsq will not work with numpy arrays + Gp = G - X @ da.linalg.lstsq(X, G)[0] + yp = y - X @ da.linalg.lstsq(X, y)[0] + + # Estimate coefficients for each loop covariate + # Note: a key assumption here is that 0-mean residuals + # from projection require no extra terms in variance + # estimate for loop covariates (columns of G), which is + # only true when an intercept is present. 
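The projection step is an application of the Frisch-Waugh-Lovell theorem: regressing the residualized outcome on the residualized loop covariate recovers the same coefficient as a joint OLS fit that includes the core covariates. A self-contained numpy sketch of that equivalence, on simulated values rather than anything from this patch:

    import numpy as np

    rs = np.random.RandomState(0)
    n = 50
    X = np.c_[np.ones(n), rs.normal(size=(n, 2))]   # intercept + core covariates
    g = rs.uniform(0, 2, size=n)                    # one loop covariate (dosage)
    y = 0.5 * g + X @ np.array([1.0, -0.3, 0.2]) + rs.normal(scale=0.1, size=n)

    gp = g - X @ np.linalg.lstsq(X, g, rcond=None)[0]   # residualize against X
    yp = y - X @ np.linalg.lstsq(X, y, rcond=None)[0]
    beta_fwl = gp @ yp / (gp ** 2).sum()

    beta_joint = np.linalg.lstsq(np.c_[g, X], y, rcond=None)[0][0]
    assert np.allclose(beta_fwl, beta_joint)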
+ Gps = (Gp ** 2).sum(axis=0) + b = (Gp.T @ yp) / Gps + + # Compute statistics and p values for each regression separately + # Note: dof w/ -2 includes loop covariate + dof = y.shape[0] - X.shape[1] - 2 + y_resid = yp[:, np.newaxis] - Gp * b + rss = (y_resid ** 2).sum(axis=0) + t_val = b / np.sqrt((rss / dof) / Gps) + p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) + + return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) + + +def _get_loop_covariates(ds: Dataset, dosage: str = None): + if dosage is None: + G = ds["call/genotype"].sum(dim=DIM_PLOIDY) + else: + G = ds[dosage] + return da.asarray(G.data) + + +def _get_core_covariates( + ds: Dataset, covariates: Sequence[str], add_intercept: bool = False +): + if not add_intercept and not covariates: + raise ValueError( + "At least one covariate must be provided when `add_intercept`=False" + ) + X = da.stack([da.asarray(ds[c].data) for c in covariates]).T + if add_intercept: + X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) + # Note: dask qr decomp (used by lstsq) requires no chunking in one + # dimension, and because dim 0 will be far greater than the number + # of covariates for the large majority of use cases, chunking + # should be removed from dim 1 + return X.rechunk((None, -1)) + + +def linear_regression( + ds: Dataset, + covariates: Sequence[str], + dosage: str, + trait: str, + add_intercept: bool = True, +): + """Run linear regression to identify trait associations with genetic variation data + + TODO: Explain vectorization scheme and add references + + Warning: Regression statistics from this implementation are only valid when an intercept + is present. The `add_intercept` flag is a convenience for adding one when not + already present, but there is currently no parameterization for intercept-free regression. + + Parameters + ---------- + ds : Dataset + Dataset containing necessary dependent and independent variables + covariates : Sequence[str] + Covariate variable names + dosage : str + Dosage variable name where "dosage" array can contain represent + one of several possible quantities, e.g.: + - Alternate allele counts + - Recessive or dominant allele encodings + - True dosages as computed from imputed or probabilistic variant calls + trait : str + Trait (e.g. phenotype) variable name; must be continuous + add_intercept : bool, optional + Add intercept term to covariate set, by default True. 
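A sketch of calling this on a dataset that already carries dosage, covariate, and trait variables; the variable names mirror the test fixtures and ds itself is assumed:

    from sgkit.stats.association import linear_regression

    res = linear_regression(
        ds,                                   # xarray.Dataset holding the variables below
        covariates=["covar_0", "covar_1"],    # per-sample covariate variables
        dosage="dosage",                      # (variants, samples) dosage variable
        trait="trait_0",                      # continuous per-sample outcome
    )
    # res is a Dataset with per-variant "betas", "t_values" and "p_values".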
+ + Returns + ------- + Dataset + Regression result containing: + - betas: beta values associated with each independent variant regressed + against the trait + - t_values: T-test statistic for beta estimate + - p_values: P-value for beta estimate (float in [0, 1]) + """ + G = _get_loop_covariates(ds, dosage=dosage) + Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) + y = da.asarray(ds[trait].data) + res = _linear_regression(G.T, Z, y) + return xr.Dataset( + dict( + betas=(DIM_VARIANT, res.betas), + t_values=(DIM_VARIANT, res.t_values), + p_values=(DIM_VARIANT, res.p_values), + ) + ) diff --git a/sgkit/stats/gwas.py b/sgkit/stats/gwas.py deleted file mode 100644 index 72e7a4435..000000000 --- a/sgkit/stats/gwas.py +++ /dev/null @@ -1,88 +0,0 @@ -import collections -from typing import Sequence - -import dask.array as da -import numpy as np -from dask.array import stats -from xarray import Dataset - -from ..api import DIM_PLOIDY, DIM_VARIANT - -LinearRegressionResult = collections.namedtuple( - "LinearRegressionResult", ["betas", "t_values", "p_values"] -) - - -def _linear_regression(X, Z, y) -> LinearRegressionResult: - """Efficient linear regression estimation for multiple covariate sets - - Parameters - ---------- - X : (M, N) array-like - "Loop" covariates for which a separate regression will be fit to - individual columns - Z : (M, P) array-like - "Core" covariates that are included in the regressions along - with each loop covariate - y : (M,) - Continuous outcome - - Returns - ------- - tuple - [description] - """ - # Apply orthogonal projection to eliminate core covariates - print("Z", Z) - print("X", X) - print("y", y) - Xp = X - Z @ da.linalg.lstsq(Z, X)[0] - yp = y - Z @ da.linalg.lstsq(Z, y)[0] - - # Estimate coefficients for each loop covariate - Xps = (Xp ** 2).sum(axis=0) - b = (Xp.T @ yp) / Xps - - # Compute statistics and p values for each regression separately - dof = y.shape[0] - Z.shape[1] - 1 - y_resid = yp[:, np.newaxis] - Xp * b - rss = (y_resid ** 2).sum(axis=0) - t_val = b / np.sqrt((rss / dof) / Xps) - p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) - - return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) - - -def _get_loop_covariates(ds: Dataset, dosage: str = None): - if dosage is None: - X = ds["call/genotype"].sum(dim=DIM_PLOIDY) - else: - X = ds[dosage] - return da.asarray(X.data) - - -def _get_core_covariates( - ds: Dataset, covariates: Sequence[str], add_intercept: bool = False -): - X = da.stack([da.asarray(ds[c].data) for c in covariates]).T - if add_intercept: - X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) - return X.rechunk((-1, -1)) - - -def linear_regression( - ds: Dataset, - covariates: Sequence[str], - dosage: str, - trait: str, - add_intercept: bool = True, -): - X = _get_loop_covariates(ds, dosage=dosage) - Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) - y = da.asarray(ds[trait].data) - res = _linear_regression(X.T, Z, y) - return ds.assign( - betas=(DIM_VARIANT, res.betas), - t_values=(DIM_VARIANT, res.t_values), - p_values=(DIM_VARIANT, res.p_values), - ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py new file mode 100644 index 000000000..8415253b8 --- /dev/null +++ b/sgkit/tests/test_association.py @@ -0,0 +1,108 @@ +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from sgkit.stats.association import linear_regression + + +def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, 
seed=None): + """Test data simulator for multiple variant associations to a continuous outcome + + Outcomes for each variant are simulated separately based on linear combinations + of randomly generated fixed effect covariates as well as the variant itself. + + This does not add an intercept term in covariates. + + Parameters + ---------- + n : int, optional + Number of samples + m : int, optional + Number of variants + p : int, optional + Number of covariates + e_std : float, optional + Standard deviation for noise term + b_zero_slice : slice + Variant beta values to zero out, defaults to `slice(m // 2)` + meaning that the first half will all be 0. + Set to `slice(0)` to disable. + + Returns + ------- + n : int + Number of samples + m : int + Number of variants + p : int + Number of covariates + g : (n, m) array-like + Simulated genotype dosage + x : (n, p) array-like + Simulated covariates + bg : (m,) array-like + Variant betas + bx : (p,) array-like + Covariate betas + ys : (m, n) array-like + Outcomes for each column in genotypes i.e. variant + """ + if b_zero_slice is None: + b_zero_slice = slice(m // 2) + np.random.seed(seed) + g = np.random.uniform(size=(n, m), low=0, high=2) + x = np.random.normal(size=(n, p)) + bg = np.random.normal(size=m) + bg[b_zero_slice or slice(m // 2)] = 0 + bx = np.random.normal(size=p) + e = np.random.normal(size=n, scale=e_std) + + # Simulate y values using each variant independently + ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) + return n, m, p, g, x, bg, bx, ys + + +def _generate_test_dataset(**kwargs): + n, m, p, g, x, bg, bx, ys = _generate_test_data(**kwargs) + data_vars = {} + # TODO: use literals or constants for dimension names? + data_vars["dosage"] = (["variant", "sample"], g.T) + for i in range(x.shape[1]): + data_vars[f"covar_{i}"] = (["sample"], x[:, i]) + for i in range(len(ys)): + data_vars[f"trait_{i}"] = (["sample"], ys[i]) + attrs = dict(betas=bg) + return xr.Dataset(data_vars, attrs=attrs) + + +@pytest.fixture +def ds(): + return _generate_test_dataset() + + +def get_statistics(ds, **kwargs): + dfr = [] + for i in range(ds.dims["variant"]): + dsr = linear_regression(ds, dosage="dosage", trait=f"trait_{i}", **kwargs) + dfr.append(dsr.to_dataframe().iloc[[i]]) + return pd.concat(dfr).reset_index(drop=True) + + +def test_linear_regression_statistics(ds): + df = get_statistics( + ds, covariates=["covar_0", "covar_1", "covar_2"], add_intercept=True + ) + # TODO: should we standardize on printing useful debugging info in the event of failures? 
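In compact form, the simulator above draws one outcome vector per variant j as

    y[j] = g[:, j] * bg[j] + x @ bx + e

with a single noise draw e (scale e_std) shared across variants; because the first half of bg is zeroed, the assertions just below expect p-values above 0.05 for that half and below 0.05 for the rest.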
+ print(df) + np.testing.assert_allclose(df["betas"], ds.attrs["betas"], atol=1e-3) + mid_idx = ds.dims["variant"] // 2 + assert np.all(df["p_values"].iloc[:mid_idx] > 0.05) + assert np.all(df["p_values"].iloc[mid_idx:] < 0.05) + + +def test_linear_regression_raise_on_no_covars(ds): + with pytest.raises(ValueError): + linear_regression( + ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False + ) From 2b83a9ff1d196979702c822fe548222f03a994a9 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:30:40 +0000 Subject: [PATCH 07/20] Adding dask dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6fefda50d..d3fb40fc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy xarray +dask From 7806732b6b26c0893c84fdd32d2ec35a3b4cd76a Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:32:41 +0000 Subject: [PATCH 08/20] Adding dask dependency --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d3fb40fc9..8909d53ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy xarray -dask +dask[array] From 2dc6f9197d203658b06998c9ca3128b041de05f2 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:34:08 +0000 Subject: [PATCH 09/20] Adding scipy dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8909d53ea..894396a41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy xarray dask[array] +scipy From 43c163ee84beb5fc80be4907712e5e3de9315dc9 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 18:43:17 +0000 Subject: [PATCH 10/20] Adding statsmodels validation to tests --- README.md | 2 +- requirements-dev.txt | 1 + setup.cfg | 2 +- sgkit/stats/association.py | 35 +++++++++-------- sgkit/tests/test_association.py | 67 ++++++++++++++++++++++++++------- 5 files changed, 77 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 14e4bad35..194aae428 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ pytest To check code coverage and get a coverage report, run ```bash -pytest --cov=sgkit +pytest --cov=sgkit --cov-report term-missing ``` ### Code standards diff --git a/requirements-dev.txt b/requirements-dev.txt index afc8cb84e..f3411c4c3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ mypy pre-commit pytest pytest-cov +statsmodels diff --git a/setup.cfg b/setup.cfg index 3d335c6b4..700ae0930 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,numpy,pandas,pytest,setuptools,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,statsmodels,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index d84ffe82b..730287bf6 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -10,7 +10,7 @@ from ..api import DIM_PLOIDY, DIM_VARIANT LinearRegressionResult = collections.namedtuple( - "LinearRegressionResult", ["betas", "t_values", "p_values"] + "LinearRegressionResult", ["beta", "t_value", "p_value"] ) @@ -30,13 +30,16 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: Returns ------- - tuple - [description] + LinearRegressionResult + Regression statistics and coefficient estimates """ G, 
X = da.asarray(G), da.asarray(X) # Apply orthogonal projection to eliminate core covariates - # Note: dask lstsq will not work with numpy arrays + # Note: QR factorization or SVD should be used here to find + # what are effectively OLS residuals rather than matrix inverse + # to avoid need for MxM array; additionally, dask.lstsq will not + # work with numpy arrays Gp = G - X @ da.linalg.lstsq(X, G)[0] yp = y - X @ da.linalg.lstsq(X, y)[0] @@ -50,18 +53,20 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: # Compute statistics and p values for each regression separately # Note: dof w/ -2 includes loop covariate - dof = y.shape[0] - X.shape[1] - 2 + dof = y.shape[0] - X.shape[1] - 1 y_resid = yp[:, np.newaxis] - Gp * b rss = (y_resid ** 2).sum(axis=0) t_val = b / np.sqrt((rss / dof) / Gps) p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) - return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) + return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) def _get_loop_covariates(ds: Dataset, dosage: str = None): if dosage is None: - G = ds["call/genotype"].sum(dim=DIM_PLOIDY) + # TODO: This should be (probably gwas-specific) allele + # count with sex chromosome considerations + G = ds["call/genotype"].sum(dim=DIM_PLOIDY) # pragma: no cover else: G = ds[dosage] return da.asarray(G.data) @@ -112,18 +117,18 @@ def linear_regression( - Recessive or dominant allele encodings - True dosages as computed from imputed or probabilistic variant calls trait : str - Trait (e.g. phenotype) variable name; must be continuous + Trait (e.g. phenotype) variable name, must be continuous add_intercept : bool, optional - Add intercept term to covariate set, by default True. + Add intercept term to covariate set, by default True Returns ------- Dataset Regression result containing: - - betas: beta values associated with each independent variant regressed + - beta: beta values associated with each independent variant regressed against the trait - - t_values: T-test statistic for beta estimate - - p_values: P-value for beta estimate (float in [0, 1]) + - t_value: T-test statistic for beta estimate + - p_value: P-value for beta estimate (unscaled float in [0, 1]) """ G = _get_loop_covariates(ds, dosage=dosage) Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) @@ -131,8 +136,8 @@ def linear_regression( res = _linear_regression(G.T, Z, y) return xr.Dataset( dict( - betas=(DIM_VARIANT, res.betas), - t_values=(DIM_VARIANT, res.t_values), - p_values=(DIM_VARIANT, res.p_values), + beta=(DIM_VARIANT, res.beta), + t_value=(DIM_VARIANT, res.t_value), + p_value=(DIM_VARIANT, res.p_value), ) ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 8415253b8..612820c04 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +import statsmodels.api as sm import xarray as xr from sgkit.stats.association import linear_regression @@ -72,7 +73,7 @@ def _generate_test_dataset(**kwargs): data_vars[f"covar_{i}"] = (["sample"], x[:, i]) for i in range(len(ys)): data_vars[f"trait_{i}"] = (["sample"], ys[i]) - attrs = dict(betas=bg) + attrs = dict(beta=bg) return xr.Dataset(data_vars, attrs=attrs) @@ -81,24 +82,64 @@ def ds(): return _generate_test_dataset() -def get_statistics(ds, **kwargs): - dfr = [] +def _sm_statistics(ds, i, add_intercept): + X = [] + # Make sure first independent variable is variant + X.append(ds["dosage"].values[i]) + for v in [c 
for c in list(ds.keys()) if c.startswith("covar_")]: + X.append(ds[v].values) + if add_intercept: + X.append(np.ones(ds.dims["sample"])) + X = np.stack(X).T + y = ds[f"trait_{i}"].values + + return sm.OLS(y, X, hasconst=True).fit() + + +def _get_statistics(ds, add_intercept, **kwargs): + df_pred, df_true = [], [] for i in range(ds.dims["variant"]): - dsr = linear_regression(ds, dosage="dosage", trait=f"trait_{i}", **kwargs) - dfr.append(dsr.to_dataframe().iloc[[i]]) - return pd.concat(dfr).reset_index(drop=True) + dsr = linear_regression( + ds, + dosage="dosage", + trait=f"trait_{i}", + add_intercept=add_intercept, + **kwargs, + ) + res = _sm_statistics(ds, i, add_intercept) + df_pred.append(dsr.to_dataframe().iloc[i].to_dict()) + df_true.append(dict(t_value=res.tvalues[0], p_value=res.pvalues[0])) + return pd.DataFrame(df_pred), pd.DataFrame(df_true) def test_linear_regression_statistics(ds): - df = get_statistics( + def validate(dfp, dft): + # TODO: should we standardize on printing useful debugging info in the event of failures? + print(dfp) + print(dft) + + # Validate results at a higher level, looking only for recapitulation + # of more obvious inferences based on how the data was simulated + np.testing.assert_allclose(dfp["beta"], ds.attrs["beta"], atol=1e-3) + mid_idx = ds.dims["variant"] // 2 + assert np.all(dfp["p_value"].iloc[:mid_idx] > 0.05) + assert np.all(dfp["p_value"].iloc[mid_idx:] < 0.05) + + # Validate more precisely against statsmodels results + np.testing.assert_allclose(dfp["t_value"], dft["t_value"]) + np.testing.assert_allclose(dfp["p_value"], dft["p_value"]) + + dfp, dft = _get_statistics( ds, covariates=["covar_0", "covar_1", "covar_2"], add_intercept=True ) - # TODO: should we standardize on printing useful debugging info in the event of failures? - print(df) - np.testing.assert_allclose(df["betas"], ds.attrs["betas"], atol=1e-3) - mid_idx = ds.dims["variant"] // 2 - assert np.all(df["p_values"].iloc[:mid_idx] > 0.05) - assert np.all(df["p_values"].iloc[mid_idx:] < 0.05) + validate(dfp, dft) + + dfp, dft = _get_statistics( + ds.assign(covar_3=("sample", np.ones(ds.dims["sample"]))), + covariates=["covar_0", "covar_1", "covar_2", "covar_3"], + add_intercept=False, + ) + validate(dfp, dft) def test_linear_regression_raise_on_no_covars(ds): From 3ca89dc0ab6b53c4e5b327fc02cd2394e996a6f9 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 11:52:16 +0000 Subject: [PATCH 11/20] Adding docs --- sgkit/stats/association.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 730287bf6..5cab2436f 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -44,7 +44,7 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: yp = y - X @ da.linalg.lstsq(X, y)[0] # Estimate coefficients for each loop covariate - # Note: a key assumption here is that 0-mean residuals + # Note: A key assumption here is that 0-mean residuals # from projection require no extra terms in variance # estimate for loop covariates (columns of G), which is # only true when an intercept is present. 
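Spelled out, the per-variant statistics computed just below use the residualized dosage and outcome from the projection above, with dof = n - ncols(X) - 1 (the extra 1 accounting for the loop covariate):

    \hat\beta_j = \frac{\tilde g_j^\top \tilde y}{\lVert \tilde g_j \rVert^2},
    \qquad
    t_j = \frac{\hat\beta_j}{\sqrt{(\mathrm{RSS}_j / \mathrm{dof}) / \lVert \tilde g_j \rVert^2}},
    \qquad
    p_j = 2\,\Pr\bigl(T_{\mathrm{dof}} \ge |t_j|\bigr),
    \qquad
    \mathrm{RSS}_j = \lVert \tilde y - \tilde g_j \hat\beta_j \rVert^2

This is the standard single-coefficient OLS t-test, just evaluated for every variant at once.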
@@ -52,11 +52,10 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: b = (Gp.T @ yp) / Gps # Compute statistics and p values for each regression separately - # Note: dof w/ -2 includes loop covariate dof = y.shape[0] - X.shape[1] - 1 y_resid = yp[:, np.newaxis] - Gp * b rss = (y_resid ** 2).sum(axis=0) - t_val = b / np.sqrt((rss / dof) / Gps) + t_val = b / np.sqrt(rss / dof / Gps) p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) @@ -96,13 +95,19 @@ def linear_regression( trait: str, add_intercept: bool = True, ): - """Run linear regression to identify trait associations with genetic variation data + """Run linear regression to identify continuous trait associations with genetic variants - TODO: Explain vectorization scheme and add references + This method solves OLS regressions for each variant simultaneously and reports + effect statistics as defined in [1]. This is facilitated by the removal of + sample (i.e. person/individual) covariates through orthogonal projection + of both the genetic variant and phenotype data [2]. A consequence of this + rotation is that effect sizes and significances cannot be reported for + covariates, only variants. - Warning: Regression statistics from this implementation are only valid when an intercept - is present. The `add_intercept` flag is a convenience for adding one when not - already present, but there is currently no parameterization for intercept-free regression. + Warning: Regression statistics from this implementation are only valid when an + intercept is present. The `add_intercept` flag is a convenience for adding one + when not already present, but there is currently no parameterization for + intercept-free regression. Parameters ---------- @@ -121,6 +126,15 @@ def linear_regression( add_intercept : bool, optional Add intercept term to covariate set, by default True + References + ---------- + - [1] Friedman, Jerome, Trevor Hastie, and Robert Tibshirani. 2001. The Elements + of Statistical Learning. Vol. 1. Springer series in statistics New York. + - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson, + Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. “Efficient + Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.” + Nature Genetics 47 (3): 284–90. 
+ Returns ------- Dataset From 461e1f6e05dd27f25ac3afa3aa4f06db2891fbb7 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 12:04:01 +0000 Subject: [PATCH 12/20] Switch to random state instance for simulated data --- sgkit/tests/test_association.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 612820c04..6ec80ae0c 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -7,7 +7,7 @@ from sgkit.stats.association import linear_regression -def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=None): +def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1): """Test data simulator for multiple variant associations to a continuous outcome Outcomes for each variant are simulated separately based on linear combinations @@ -51,13 +51,13 @@ def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=N """ if b_zero_slice is None: b_zero_slice = slice(m // 2) - np.random.seed(seed) - g = np.random.uniform(size=(n, m), low=0, high=2) - x = np.random.normal(size=(n, p)) - bg = np.random.normal(size=m) + rs = np.random.RandomState(seed) + g = rs.uniform(size=(n, m), low=0, high=2) + x = rs.normal(size=(n, p)) + bg = rs.normal(size=m) bg[b_zero_slice or slice(m // 2)] = 0 - bx = np.random.normal(size=p) - e = np.random.normal(size=n, scale=e_std) + bx = rs.normal(size=p) + e = rs.normal(size=n, scale=e_std) # Simulate y values using each variant independently ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) @@ -114,7 +114,6 @@ def _get_statistics(ds, add_intercept, **kwargs): def test_linear_regression_statistics(ds): def validate(dfp, dft): - # TODO: should we standardize on printing useful debugging info in the event of failures? print(dfp) print(dft) From be2cfc578134ab8ccb45534d5cce60ef3e2bf3fc Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 12:05:35 +0000 Subject: [PATCH 13/20] Adding coveragerc to remove tests from coverage check --- .coveragerc | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..083aa052e --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +omit = + sgkit/tests/* From 0627f5221577bc520416824fa245d290b236fe46 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 12:32:02 +0000 Subject: [PATCH 14/20] Update reference --- sgkit/stats/association.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 5cab2436f..3c97aa3e6 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -128,8 +128,9 @@ def linear_regression( References ---------- - - [1] Friedman, Jerome, Trevor Hastie, and Robert Tibshirani. 2001. The Elements - of Statistical Learning. Vol. 1. Springer series in statistics New York. + - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements + of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition. + Springer Science & Business Media. - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson, Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. 
“Efficient Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.” From c76c720b67913a72df9eddeabdad3d53f9ae0ce7 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Mon, 6 Jul 2020 11:19:10 +0000 Subject: [PATCH 15/20] Add missing terms to coverage report in build --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9c1f6fc8f..faa91b5fe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,4 +21,4 @@ jobs: uses: pre-commit/action@v2.0.0 - name: Test with pytest and coverage run: | - pytest -v --cov=sgkit + pytest -v --cov=sgkit --cov-report term-missing From 505fba057aba015e691f0efbc61b2191bcf54e0d Mon Sep 17 00:00:00 2001 From: eric-czech Date: Mon, 6 Jul 2020 13:39:58 +0000 Subject: [PATCH 16/20] Return type annotations --- sgkit/stats/association.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 3c97aa3e6..9db8b5e8a 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -4,7 +4,7 @@ import dask.array as da import numpy as np import xarray as xr -from dask.array import stats +from dask.array import Array, stats from xarray import Dataset from ..api import DIM_PLOIDY, DIM_VARIANT @@ -61,7 +61,7 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) -def _get_loop_covariates(ds: Dataset, dosage: str = None): +def _get_loop_covariates(ds: Dataset, dosage: str = None) -> Array: if dosage is None: # TODO: This should be (probably gwas-specific) allele # count with sex chromosome considerations @@ -73,7 +73,7 @@ def _get_loop_covariates(ds: Dataset, dosage: str = None): def _get_core_covariates( ds: Dataset, covariates: Sequence[str], add_intercept: bool = False -): +) -> Array: if not add_intercept and not covariates: raise ValueError( "At least one covariate must be provided when `add_intercept`=False" @@ -94,7 +94,7 @@ def linear_regression( dosage: str, trait: str, add_intercept: bool = True, -): +) -> Dataset: """Run linear regression to identify continuous trait associations with genetic variants This method solves OLS regressions for each variant simultaneously and reports From 2c386e21f72e10d48c35c99794fdddef6ad92509 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Thu, 9 Jul 2020 15:38:58 -0400 Subject: [PATCH 17/20] Filter statsmodels warning on import, remove dim constants, change output variable names --- setup.cfg | 2 +- sgkit/__init__.py | 2 ++ sgkit/stats/association.py | 14 ++++++-------- sgkit/tests/test_association.py | 17 +++++++++++++++-- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/setup.cfg b/setup.cfg index 700ae0930..3d335c6b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,numpy,pandas,pytest,setuptools,statsmodels,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 8b170a470..3340b680c 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -5,6 +5,7 @@ DIM_VARIANT, create_genotype_call_dataset, ) +from .stats.association import linear_regression __all__ = [ "DIM_ALLELE", @@ -12,4 +13,5 @@ "DIM_SAMPLE", "DIM_VARIANT", "create_genotype_call_dataset", + 
"linear_regression", ] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 9db8b5e8a..d9939efd8 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -7,8 +7,6 @@ from dask.array import Array, stats from xarray import Dataset -from ..api import DIM_PLOIDY, DIM_VARIANT - LinearRegressionResult = collections.namedtuple( "LinearRegressionResult", ["beta", "t_value", "p_value"] ) @@ -65,7 +63,7 @@ def _get_loop_covariates(ds: Dataset, dosage: str = None) -> Array: if dosage is None: # TODO: This should be (probably gwas-specific) allele # count with sex chromosome considerations - G = ds["call/genotype"].sum(dim=DIM_PLOIDY) # pragma: no cover + G = ds["call/genotype"].sum(dim="ploidy") # pragma: no cover else: G = ds[dosage] return da.asarray(G.data) @@ -150,9 +148,9 @@ def linear_regression( y = da.asarray(ds[trait].data) res = _linear_regression(G.T, Z, y) return xr.Dataset( - dict( - beta=(DIM_VARIANT, res.beta), - t_value=(DIM_VARIANT, res.t_value), - p_value=(DIM_VARIANT, res.p_value), - ) + { + "variant/beta": ("variants", res.beta), + "variant/t_value": ("variants", res.t_value), + "variant/p_value": ("variants", res.p_value), + } ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 6ec80ae0c..5b852f09d 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -1,11 +1,19 @@ +import warnings + import numpy as np import pandas as pd import pytest -import statsmodels.api as sm import xarray as xr from sgkit.stats.association import linear_regression +with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # Ignore: DeprecationWarning: Using or importing the ABCs from 'collections' + # instead of from 'collections.abc' is deprecated since Python 3.3, + # and in 3.9 it will stop working + import statsmodels.api as sm + def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1): """Test data simulator for multiple variant associations to a continuous outcome @@ -107,7 +115,12 @@ def _get_statistics(ds, add_intercept, **kwargs): **kwargs, ) res = _sm_statistics(ds, i, add_intercept) - df_pred.append(dsr.to_dataframe().iloc[i].to_dict()) + df_pred.append( + dsr.to_dataframe() + .rename(columns=lambda c: c.replace("variant/", "")) + .iloc[i] + .to_dict() + ) df_true.append(dict(t_value=res.tvalues[0], p_value=res.pvalues[0])) return pd.DataFrame(df_pred), pd.DataFrame(df_true) From a4835a295c667deb6f08e01bb24b6432d3cc3fa8 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 10 Jul 2020 17:40:41 -0400 Subject: [PATCH 18/20] Renaming linear_regression --- sgkit/__init__.py | 4 ++-- sgkit/stats/association.py | 6 +++--- sgkit/tests/test_association.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 3340b680c..6417b5646 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -5,7 +5,7 @@ DIM_VARIANT, create_genotype_call_dataset, ) -from .stats.association import linear_regression +from .stats.association import gwas_linear_regression __all__ = [ "DIM_ALLELE", @@ -13,5 +13,5 @@ "DIM_SAMPLE", "DIM_VARIANT", "create_genotype_call_dataset", - "linear_regression", + "gwas_linear_regression", ] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index d9939efd8..a4e6194d8 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -12,7 +12,7 @@ ) -def _linear_regression(G, X, y) -> LinearRegressionResult: +def _gwas_linear_regression(G, X, 
y) -> LinearRegressionResult: """Efficient linear regression estimation for multiple covariate sets Parameters @@ -86,7 +86,7 @@ def _get_core_covariates( return X.rechunk((None, -1)) -def linear_regression( +def gwas_linear_regression( ds: Dataset, covariates: Sequence[str], dosage: str, @@ -146,7 +146,7 @@ def linear_regression( G = _get_loop_covariates(ds, dosage=dosage) Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) y = da.asarray(ds[trait].data) - res = _linear_regression(G.T, Z, y) + res = _gwas_linear_regression(G.T, Z, y) return xr.Dataset( { "variant/beta": ("variants", res.beta), diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 5b852f09d..533cf9be7 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -5,7 +5,7 @@ import pytest import xarray as xr -from sgkit.stats.association import linear_regression +from sgkit.stats.association import gwas_linear_regression with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) @@ -107,7 +107,7 @@ def _sm_statistics(ds, i, add_intercept): def _get_statistics(ds, add_intercept, **kwargs): df_pred, df_true = [], [] for i in range(ds.dims["variant"]): - dsr = linear_regression( + dsr = gwas_linear_regression( ds, dosage="dosage", trait=f"trait_{i}", @@ -156,6 +156,6 @@ def validate(dfp, dft): def test_linear_regression_raise_on_no_covars(ds): with pytest.raises(ValueError): - linear_regression( + gwas_linear_regression( ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False ) From f2c0352fe5da2595cc92fa1eda43eabc71d57075 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 17 Jul 2020 09:55:53 -0400 Subject: [PATCH 19/20] Fixing mypy errors --- .pre-commit-config.yaml | 2 +- setup.cfg | 6 ++++ sgkit/stats/association.py | 10 +++++-- sgkit/tests/test_association.py | 50 +++++++++++++++++++-------------- sgkit/typing.py | 6 +++- 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b114f16f..1dc9d203a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,5 +29,5 @@ repos: rev: v0.782 hooks: - id: mypy - args: ["--strict"] + args: ["--strict", "--show-error-codes"] additional_dependencies: ["numpy", "xarray"] diff --git a/setup.cfg b/setup.cfg index f2c098dcd..c8097812a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,8 +60,14 @@ line_length = 88 [mypy-numpy.*] ignore_missing_imports = True +[mypy-pandas.*] +ignore_missing_imports = True +[mypy-dask.*] +ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True +[mypy-statsmodels.*] +ignore_missing_imports = True [mypy-setuptools] ignore_missing_imports = True [mypy-sgkit.tests.*] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index a4e6194d8..1f280bf36 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -1,5 +1,5 @@ import collections -from typing import Sequence +from typing import Optional, Sequence import dask.array as da import numpy as np @@ -7,12 +7,16 @@ from dask.array import Array, stats from xarray import Dataset +from ..typing import ArrayLike + LinearRegressionResult = collections.namedtuple( "LinearRegressionResult", ["beta", "t_value", "p_value"] ) -def _gwas_linear_regression(G, X, y) -> LinearRegressionResult: +def _gwas_linear_regression( + G: ArrayLike, X: ArrayLike, y: ArrayLike +) -> LinearRegressionResult: """Efficient linear regression estimation for multiple covariate sets Parameters @@ -59,7 
+63,7 @@ def _gwas_linear_regression(G, X, y) -> LinearRegressionResult: return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) -def _get_loop_covariates(ds: Dataset, dosage: str = None) -> Array: +def _get_loop_covariates(ds: Dataset, dosage: Optional[str] = None) -> Array: if dosage is None: # TODO: This should be (probably gwas-specific) allele # count with sex chromosome considerations diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 533cf9be7..2a3a8b69b 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -1,11 +1,15 @@ import warnings +from typing import Any, Dict, List, Optional, Tuple import numpy as np import pandas as pd import pytest import xarray as xr +from pandas import DataFrame +from xarray import Dataset from sgkit.stats.association import gwas_linear_regression +from sgkit.typing import ArrayLike with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) @@ -13,9 +17,17 @@ # instead of from 'collections.abc' is deprecated since Python 3.3, # and in 3.9 it will stop working import statsmodels.api as sm + from statsmodels.regression.linear_model import RegressionResultsWrapper -def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1): +def _generate_test_data( + n: int = 100, + m: int = 10, + p: int = 3, + e_std: float = 0.001, + b_zero_slice: Optional[slice] = None, + seed: Optional[int] = 1, +) -> Tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]: """Test data simulator for multiple variant associations to a continuous outcome Outcomes for each variant are simulated separately based on linear combinations @@ -40,20 +52,12 @@ def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1 Returns ------- - n : int - Number of samples - m : int - Number of variants - p : int - Number of covariates g : (n, m) array-like Simulated genotype dosage x : (n, p) array-like Simulated covariates bg : (m,) array-like Variant betas - bx : (p,) array-like - Covariate betas ys : (m, n) array-like Outcomes for each column in genotypes i.e. variant """ @@ -69,28 +73,29 @@ def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1 # Simulate y values using each variant independently ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) - return n, m, p, g, x, bg, bx, ys + return g, x, bg, ys -def _generate_test_dataset(**kwargs): - n, m, p, g, x, bg, bx, ys = _generate_test_data(**kwargs) +def _generate_test_dataset(**kwargs: Any) -> Dataset: + g, x, bg, ys = _generate_test_data(**kwargs) data_vars = {} - # TODO: use literals or constants for dimension names? 
data_vars["dosage"] = (["variant", "sample"], g.T) for i in range(x.shape[1]): data_vars[f"covar_{i}"] = (["sample"], x[:, i]) for i in range(len(ys)): data_vars[f"trait_{i}"] = (["sample"], ys[i]) attrs = dict(beta=bg) - return xr.Dataset(data_vars, attrs=attrs) + return xr.Dataset(data_vars, attrs=attrs) # type: ignore[arg-type] -@pytest.fixture -def ds(): +@pytest.fixture # type: ignore[misc] +def ds() -> Dataset: return _generate_test_dataset() -def _sm_statistics(ds, i, add_intercept): +def _sm_statistics( + ds: Dataset, i: int, add_intercept: bool +) -> RegressionResultsWrapper: X = [] # Make sure first independent variable is variant X.append(ds["dosage"].values[i]) @@ -104,8 +109,11 @@ def _sm_statistics(ds, i, add_intercept): return sm.OLS(y, X, hasconst=True).fit() -def _get_statistics(ds, add_intercept, **kwargs): - df_pred, df_true = [], [] +def _get_statistics( + ds: Dataset, add_intercept: bool, **kwargs: Any +) -> Tuple[DataFrame, DataFrame]: + df_pred: List[Dict[str, Any]] = [] + df_true: List[Dict[str, Any]] = [] for i in range(ds.dims["variant"]): dsr = gwas_linear_regression( ds, @@ -116,7 +124,7 @@ def _get_statistics(ds, add_intercept, **kwargs): ) res = _sm_statistics(ds, i, add_intercept) df_pred.append( - dsr.to_dataframe() + dsr.to_dataframe() # type: ignore[no-untyped-call] .rename(columns=lambda c: c.replace("variant/", "")) .iloc[i] .to_dict() @@ -126,7 +134,7 @@ def _get_statistics(ds, add_intercept, **kwargs): def test_linear_regression_statistics(ds): - def validate(dfp, dft): + def validate(dfp: DataFrame, dft: DataFrame) -> None: print(dfp) print(dft) diff --git a/sgkit/typing.py b/sgkit/typing.py index 3dea6b2be..fab53b2a9 100644 --- a/sgkit/typing.py +++ b/sgkit/typing.py @@ -1,3 +1,7 @@ -from typing import Any +from typing import Any, Union + +import dask.array as da +import numpy as np DType = Any +ArrayLike = Union[np.ndarray, da.Array] From 273375e5d949ae3cd6b22572a6fccb2bd3064f0f Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 17 Jul 2020 11:17:15 -0400 Subject: [PATCH 20/20] Requested changes --- .pre-commit-config.yaml | 2 +- sgkit/stats/association.py | 11 +++++++---- sgkit/tests/test_association.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dc9d203a..48b567f70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,4 +30,4 @@ repos: hooks: - id: mypy args: ["--strict", "--show-error-codes"] - additional_dependencies: ["numpy", "xarray"] + additional_dependencies: ["numpy", "xarray", "dask[array]", "scipy"] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 1f280bf36..65e7061f6 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -1,4 +1,4 @@ -import collections +from dataclasses import dataclass from typing import Optional, Sequence import dask.array as da @@ -9,9 +9,12 @@ from ..typing import ArrayLike -LinearRegressionResult = collections.namedtuple( - "LinearRegressionResult", ["beta", "t_value", "p_value"] -) + +@dataclass +class LinearRegressionResult: + beta: ArrayLike + t_value: ArrayLike + p_value: ArrayLike def _gwas_linear_regression( diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 2a3a8b69b..97407a6a5 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -88,7 +88,7 @@ def _generate_test_dataset(**kwargs: Any) -> Dataset: return xr.Dataset(data_vars, attrs=attrs) # type: ignore[arg-type] -@pytest.fixture # type: 
ignore[misc] +@pytest.fixture(scope="module") # type: ignore[misc] def ds() -> Dataset: return _generate_test_dataset() @@ -163,7 +163,7 @@ def validate(dfp: DataFrame, dft: DataFrame) -> None: def test_linear_regression_raise_on_no_covars(ds): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="At least one covariate must be provided"): gwas_linear_regression( ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False )
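
A minimal usage sketch of the association API as it stands at the end of this series, assuming sgkit at this revision plus dask[array] and scipy are installed. The dataset mirrors the shape of the test fixtures (dosage over (variant, sample); per-sample covariates and trait); the names "dosage", "covar_0", and "trait_0" are illustrative placeholders rather than a fixed sgkit convention.

    import numpy as np
    import xarray as xr

    from sgkit.stats.association import gwas_linear_regression

    rs = np.random.RandomState(0)
    n, m, p = 50, 5, 2  # samples, variants, covariates

    # Shape the dataset like the test fixtures: dosage is (variant, sample);
    # covariates and the trait are per-sample vectors.
    data_vars = {"dosage": (["variant", "sample"], rs.uniform(0, 2, size=(m, n)))}
    for i in range(p):
        data_vars[f"covar_{i}"] = (["sample"], rs.normal(size=n))
    data_vars["trait_0"] = (["sample"], rs.normal(size=n))
    ds = xr.Dataset(data_vars)

    res = gwas_linear_regression(
        ds,
        covariates=[f"covar_{i}" for i in range(p)],
        dosage="dosage",
        trait="trait_0",
        add_intercept=True,
    )

    # One beta / t_value / p_value per variant, prefixed with "variant/"
    # and indexed by the "variants" dimension.
    print(res["variant/beta"].values)
    print(res["variant/p_value"].values)

The results come back as "variant/beta", "variant/t_value", and "variant/p_value"; the tests strip the "variant/" prefix when converting the result to a dataframe so the columns line up with the statsmodels reference values.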
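
For context on what the statsmodels comparison in the tests amounts to, here is a standalone sketch of the reference fit that _sm_statistics performs for a single variant, assuming only numpy and statsmodels. The effect sizes and noise scale below are illustrative and not the values used by _generate_test_data.

    import numpy as np
    import statsmodels.api as sm

    rs = np.random.RandomState(0)
    n = 100
    dosage = rs.uniform(0, 2, size=n)   # one simulated variant
    covar = rs.normal(size=(n, 2))      # two nuisance covariates
    y = 0.5 * dosage + covar @ np.array([0.1, -0.2]) + rs.normal(scale=0.001, size=n)

    # Variant first, then covariates, then an explicit intercept column,
    # matching the column order built by _sm_statistics.
    X = np.column_stack([dosage, covar, np.ones(n)])
    res = sm.OLS(y, X, hasconst=True).fit()

    # t/p statistics for the variant column (index 0); these are the values
    # that variant/t_value and variant/p_value are checked against per variant.
    print(res.tvalues[0], res.pvalues[0])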