From 7798af4fd0d7e9bc6467d03dfbde82d240a5f036 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 29 Jun 2020 17:22:56 +0100 Subject: [PATCH 01/20] Translate the discussion in https://discourse.smadstatgen.org/t/xarray-conventions-for-variation-data/44 into code --- setup.cfg | 2 +- sgkit/__init__.py | 16 +++++++-- sgkit/api.py | 70 +++++++++++++++++++++++++++++++++++++++ sgkit/core.py | 7 ---- sgkit/tests/test_api.py | 45 +++++++++++++++++++++++++ sgkit/tests/test_core.py | 10 ------ sgkit/tests/test_utils.py | 20 +++++++++++ sgkit/utils.py | 24 ++++++++++++++ 8 files changed, 174 insertions(+), 20 deletions(-) create mode 100644 sgkit/api.py delete mode 100644 sgkit/core.py create mode 100644 sgkit/tests/test_api.py delete mode 100644 sgkit/tests/test_core.py create mode 100644 sgkit/tests/test_utils.py create mode 100644 sgkit/utils.py diff --git a/setup.cfg b/setup.cfg index d32a77d78..d9f4c83a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = numpy,xarray +known_third_party = numpy,pytest,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 15bd521e2..8b170a470 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -1,3 +1,15 @@ -from .core import create_genotype_call_dataset # noqa: F401 +from .api import ( # noqa: F401 + DIM_ALLELE, + DIM_PLOIDY, + DIM_SAMPLE, + DIM_VARIANT, + create_genotype_call_dataset, +) -__all__ = ["create_genotype_call_dataset"] +__all__ = [ + "DIM_ALLELE", + "DIM_PLOIDY", + "DIM_SAMPLE", + "DIM_VARIANT", + "create_genotype_call_dataset", +] diff --git a/sgkit/api.py b/sgkit/api.py new file mode 100644 index 000000000..708209b4d --- /dev/null +++ b/sgkit/api.py @@ -0,0 +1,70 @@ +from typing import Any, Hashable, Mapping + +import xarray as xr + +from .utils import check_array_like + +DIM_VARIANT = "variant" +DIM_SAMPLE = "sample" +DIM_PLOIDY = "ploidy" +DIM_ALLELE = "allele" +DIM_GENOTYPE = "genotype" + + +def create_genotype_call_dataset( + variant_contig: Any, + variant_pos: Any, + variant_alleles: Any, + sample_id: Any, + call_gt: Any, + call_gt_phased: Any = None, + variant_id: Any = None, +) -> xr.Dataset: + """Create a dataset of genotype calls. + + Parameters + ---------- + variant_contig : array_like, np.int8 + The (index of the) contig for each variant. + variant_pos : array_like, np.int32 + The reference position of the variant. + variant_alleles : array_like, S1 + The possible alleles for the variant. + sample_id : array_like, str + The unique identifier of the sample. + call_gt : array_like, np.int8 + Genotype, encoded as allele values (0 for the reference, 1 for + the first allele, 2 for the second allele), or -1 to indicate a + missing value. + call_gt_phased : array_like, bool, optional + A flag for each call indicating if it is phased or not. If + omitted all calls are unphased. + variant_id: array_like, str, optional + The unique identifier of the variant. + + Returns + ------- + xr.Dataset + The dataset of genotype calls. 
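A minimal usage sketch of this constructor; the array values below are illustrative, not taken from the repository:

    import numpy as np
    from sgkit import create_genotype_call_dataset

    ds = create_genotype_call_dataset(
        variant_contig=np.array([0, 0], dtype="i1"),
        variant_pos=np.array([1000, 2000], dtype="i4"),
        variant_alleles=np.array([["A", "C"], ["G", "T"]], dtype="S1"),
        sample_id=np.array(["s1", "s2"], dtype=str),
        call_gt=np.array([[[0, 1], [1, 1]], [[0, 0], [-1, -1]]], dtype="i1"),
    )
    # The result carries "variant/CONTIG", "variant/POS", "variant/ALLELES",
    # "sample/ID", "call/GT" and a derived "call/GT_mask" that flags the -1 calls.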
+ + """ + check_array_like(variant_contig, kind="i", ndim=1) + check_array_like(variant_pos, kind="i", ndim=1) + check_array_like(variant_alleles, kind="S", ndim=2) + check_array_like(sample_id, kind="U", ndim=1) + check_array_like(call_gt, kind="i", ndim=3) + data_vars: Mapping[Hashable, Any] = { + "variant/CONTIG": ([DIM_VARIANT], variant_contig), + "variant/POS": ([DIM_VARIANT], variant_pos), + "variant/ALLELES": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), + "sample/ID": ([DIM_SAMPLE], sample_id), + "call/GT": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt), + "call/GT_mask": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt < 0), + } + if call_gt_phased is not None: + check_array_like(call_gt_phased, kind="b", ndim=2) + data_vars["call/GT_phased"] = ([DIM_VARIANT, DIM_SAMPLE], call_gt_phased) + if variant_id is not None: + check_array_like(variant_id, kind="U", ndim=1) + data_vars["variant/ID"] = ([DIM_VARIANT], variant_id) + return xr.Dataset(data_vars) diff --git a/sgkit/core.py b/sgkit/core.py deleted file mode 100644 index b34368b42..000000000 --- a/sgkit/core.py +++ /dev/null @@ -1,7 +0,0 @@ -from typing import Any - -import xarray as xr - - -def create_genotype_call_dataset(arr: Any) -> xr.Dataset: - return xr.Dataset({"data": (["variant", "sample", "genotype"], arr)}) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py new file mode 100644 index 000000000..4768ec7bd --- /dev/null +++ b/sgkit/tests/test_api.py @@ -0,0 +1,45 @@ +import numpy as np +from numpy.testing import assert_array_equal + +from sgkit import ( + DIM_ALLELE, + DIM_PLOIDY, + DIM_SAMPLE, + DIM_VARIANT, + create_genotype_call_dataset, +) + + +def test_create_genotype_call_dataset(): + variant_contig = np.array([0, 0], dtype="i1") + variant_pos = np.array([1000, 2000], dtype="i4") + variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") + variant_id = np.array(["rs1", "rs2"], dtype=str) + sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) + call_gt = np.array( + [[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]], dtype="i1" + ) + call_gt_phased = np.array([[True, True, False], [True, False, False]], dtype=bool) + ds = create_genotype_call_dataset( + variant_contig, + variant_pos, + variant_alleles, + sample_id, + call_gt, + call_gt_phased=call_gt_phased, + variant_id=variant_id, + ) + + assert DIM_VARIANT in ds.dims + assert DIM_SAMPLE in ds.dims + assert DIM_PLOIDY in ds.dims + assert DIM_ALLELE in ds.dims + + assert_array_equal(ds["variant/CONTIG"], variant_contig) + assert_array_equal(ds["variant/POS"], variant_pos) + assert_array_equal(ds["variant/ALLELES"], variant_alleles) + assert_array_equal(ds["variant/ID"], variant_id) + assert_array_equal(ds["sample/ID"], sample_id) + assert_array_equal(ds["call/GT"], call_gt) + assert_array_equal(ds["call/GT_mask"], call_gt < 0) + assert_array_equal(ds["call/GT_phased"], call_gt_phased) diff --git a/sgkit/tests/test_core.py b/sgkit/tests/test_core.py deleted file mode 100644 index fc622aaa4..000000000 --- a/sgkit/tests/test_core.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy as np - -from sgkit import create_genotype_call_dataset - - -def test_create_genotype_call_dataset(): - arr = np.random.rand(4, 5, 3) # NB: not valid genotype data! 
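The replacement API validates its inputs with the check_array_like helper introduced in this patch (sgkit/utils.py); a sketch of illustrative calls:

    import numpy as np
    from sgkit.utils import check_array_like

    a = np.zeros((3, 2), dtype="i4")
    check_array_like(a, kind="i", ndim=2)   # passes: integer kind, rank 2
    check_array_like(a, ndim={1, 2})        # ndim (and dtype) may also be a set of allowed values
    # check_array_like(a, kind="f")         # would raise TypeError (wrong dtype kind)
    # check_array_like(a, ndim=1)           # would raise ValueError (wrong rank)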
- ds = create_genotype_call_dataset(arr) - assert "sample" in ds.dims - assert "variant" in ds.dims diff --git a/sgkit/tests/test_utils.py b/sgkit/tests/test_utils.py new file mode 100644 index 000000000..4688619d2 --- /dev/null +++ b/sgkit/tests/test_utils.py @@ -0,0 +1,20 @@ +import numpy as np +import pytest + +from sgkit.utils import check_array_like + + +def test_check_array_like(): + with pytest.raises(TypeError): + check_array_like("foo") + a = np.arange(100, dtype="i4") + with pytest.raises(TypeError): + check_array_like(a, dtype="i8") + with pytest.raises(TypeError): + check_array_like(a, dtype={"i1", "i2"}) + with pytest.raises(TypeError): + check_array_like(a, kind="f") + with pytest.raises(ValueError): + check_array_like(a, ndim=2) + with pytest.raises(ValueError): + check_array_like(a, ndim={2, 3}) diff --git a/sgkit/utils.py b/sgkit/utils.py new file mode 100644 index 000000000..47d2245a9 --- /dev/null +++ b/sgkit/utils.py @@ -0,0 +1,24 @@ +import numpy as np + + +def check_array_like(a, dtype=None, kind=None, ndim=None): + array_attrs = "ndim", "dtype", "shape" + for k in array_attrs: + if not hasattr(a, k): + raise TypeError + if dtype is not None: + if isinstance(dtype, set): + dtype = {np.dtype(t) for t in dtype} + if a.dtype not in dtype: + raise TypeError + elif a.dtype != np.dtype(dtype): + raise TypeError + if kind is not None: + if a.dtype.kind not in kind: + raise TypeError + if ndim is not None: + if isinstance(ndim, set): + if a.ndim not in ndim: + raise ValueError + elif ndim != a.ndim: + raise ValueError From 88dbce3e075d11797cf4c76c159a5f3b512463a3 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 30 Jun 2020 17:03:15 +0100 Subject: [PATCH 02/20] Loosen integer array types in documentation. --- sgkit/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index 708209b4d..b5aa7e3ea 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -24,15 +24,15 @@ def create_genotype_call_dataset( Parameters ---------- - variant_contig : array_like, np.int8 + variant_contig : array_like, int The (index of the) contig for each variant. - variant_pos : array_like, np.int32 + variant_pos : array_like, int The reference position of the variant. variant_alleles : array_like, S1 The possible alleles for the variant. sample_id : array_like, str The unique identifier of the sample. - call_gt : array_like, np.int8 + call_gt : array_like, int Genotype, encoded as allele values (0 for the reference, 1 for the first allele, 2 for the second allele), or -1 to indicate a missing value. From a5bca262715d457f8e85e9b34e667ceb0a0b6085 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 30 Jun 2020 17:42:00 +0100 Subject: [PATCH 03/20] Store contig names as an attribute --- sgkit/api.py | 8 ++++++-- sgkit/tests/test_api.py | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index b5aa7e3ea..e0220eed2 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -1,4 +1,4 @@ -from typing import Any, Hashable, Mapping +from typing import Any, Hashable, List, Mapping import xarray as xr @@ -12,6 +12,7 @@ def create_genotype_call_dataset( + variant_contig_names: List[str], variant_contig: Any, variant_pos: Any, variant_alleles: Any, @@ -24,6 +25,8 @@ def create_genotype_call_dataset( Parameters ---------- + variant_contig_names : list of str + The contig names. variant_contig : array_like, int The (index of the) contig for each variant. 
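Since variant_contig holds indices into the contig-name list that this patch stores in the dataset's "contigs" attribute, names can be recovered with a plain lookup; a sketch assuming ds was built by create_genotype_call_dataset:

    import numpy as np

    contig_names = np.asarray(ds.attrs["contigs"])             # e.g. ["chr1", "chr2"]
    per_variant_contig = contig_names[ds["variant/CONTIG"].values]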
variant_pos : array_like, int @@ -67,4 +70,5 @@ def create_genotype_call_dataset( if variant_id is not None: check_array_like(variant_id, kind="U", ndim=1) data_vars["variant/ID"] = ([DIM_VARIANT], variant_id) - return xr.Dataset(data_vars) + attrs = {"contigs": variant_contig_names} + return xr.Dataset(data_vars=data_vars, attrs=attrs) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py index 4768ec7bd..9ea2af739 100644 --- a/sgkit/tests/test_api.py +++ b/sgkit/tests/test_api.py @@ -11,6 +11,7 @@ def test_create_genotype_call_dataset(): + variant_contig_names = ["chr1"] variant_contig = np.array([0, 0], dtype="i1") variant_pos = np.array([1000, 2000], dtype="i4") variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") @@ -21,6 +22,7 @@ def test_create_genotype_call_dataset(): ) call_gt_phased = np.array([[True, True, False], [True, False, False]], dtype=bool) ds = create_genotype_call_dataset( + variant_contig_names, variant_contig, variant_pos, variant_alleles, @@ -35,6 +37,7 @@ def test_create_genotype_call_dataset(): assert DIM_PLOIDY in ds.dims assert DIM_ALLELE in ds.dims + assert ds.attrs["contigs"] == variant_contig_names assert_array_equal(ds["variant/CONTIG"], variant_contig) assert_array_equal(ds["variant/POS"], variant_pos) assert_array_equal(ds["variant/ALLELES"], variant_alleles) From b34b2708bca67b3f80222967827c1f9fce6753b0 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 1 Jul 2020 10:16:32 +0100 Subject: [PATCH 04/20] Move away from ALL_CAPS names. Pluralise dimension names. --- sgkit/api.py | 50 +++++++++++++++++++++++------------------ sgkit/tests/test_api.py | 30 +++++++++++++------------ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/sgkit/api.py b/sgkit/api.py index e0220eed2..84c1b4847 100644 --- a/sgkit/api.py +++ b/sgkit/api.py @@ -4,21 +4,21 @@ from .utils import check_array_like -DIM_VARIANT = "variant" -DIM_SAMPLE = "sample" +DIM_VARIANT = "variants" +DIM_SAMPLE = "samples" DIM_PLOIDY = "ploidy" -DIM_ALLELE = "allele" -DIM_GENOTYPE = "genotype" +DIM_ALLELE = "alleles" +DIM_GENOTYPE = "genotypes" def create_genotype_call_dataset( variant_contig_names: List[str], variant_contig: Any, - variant_pos: Any, + variant_position: Any, variant_alleles: Any, sample_id: Any, - call_gt: Any, - call_gt_phased: Any = None, + call_genotype: Any, + call_genotype_phased: Any = None, variant_id: Any = None, ) -> xr.Dataset: """Create a dataset of genotype calls. @@ -29,17 +29,17 @@ def create_genotype_call_dataset( The contig names. variant_contig : array_like, int The (index of the) contig for each variant. - variant_pos : array_like, int + variant_position : array_like, int The reference position of the variant. variant_alleles : array_like, S1 The possible alleles for the variant. sample_id : array_like, str The unique identifier of the sample. - call_gt : array_like, int + call_genotype : array_like, int Genotype, encoded as allele values (0 for the reference, 1 for the first allele, 2 for the second allele), or -1 to indicate a missing value. - call_gt_phased : array_like, bool, optional + call_genotype_phased : array_like, bool, optional A flag for each call indicating if it is phased or not. If omitted all calls are unphased. 
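A small sketch of the genotype encoding and the mask derived from it; the values are illustrative:

    import numpy as np

    call_genotype = np.array([[[0, 1], [1, 1], [-1, -1]]], dtype="i1")
    # shape (variants, samples, ploidy); 0 = reference allele,
    # 1 = first alternate allele, -1 = missing call
    call_genotype_mask = call_genotype < 0   # True wherever a call is missing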
variant_id: array_like, str, optional @@ -52,23 +52,29 @@ def create_genotype_call_dataset( """ check_array_like(variant_contig, kind="i", ndim=1) - check_array_like(variant_pos, kind="i", ndim=1) + check_array_like(variant_position, kind="i", ndim=1) check_array_like(variant_alleles, kind="S", ndim=2) check_array_like(sample_id, kind="U", ndim=1) - check_array_like(call_gt, kind="i", ndim=3) + check_array_like(call_genotype, kind="i", ndim=3) data_vars: Mapping[Hashable, Any] = { - "variant/CONTIG": ([DIM_VARIANT], variant_contig), - "variant/POS": ([DIM_VARIANT], variant_pos), - "variant/ALLELES": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), - "sample/ID": ([DIM_SAMPLE], sample_id), - "call/GT": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt), - "call/GT_mask": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_gt < 0), + "variant/contig": ([DIM_VARIANT], variant_contig), + "variant/position": ([DIM_VARIANT], variant_position), + "variant/alleles": ([DIM_VARIANT, DIM_ALLELE], variant_alleles), + "sample/id": ([DIM_SAMPLE], sample_id), + "call/genotype": ([DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], call_genotype), + "call/genotype_mask": ( + [DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY], + call_genotype < 0, + ), } - if call_gt_phased is not None: - check_array_like(call_gt_phased, kind="b", ndim=2) - data_vars["call/GT_phased"] = ([DIM_VARIANT, DIM_SAMPLE], call_gt_phased) + if call_genotype_phased is not None: + check_array_like(call_genotype_phased, kind="b", ndim=2) + data_vars["call/genotype_phased"] = ( + [DIM_VARIANT, DIM_SAMPLE], + call_genotype_phased, + ) if variant_id is not None: check_array_like(variant_id, kind="U", ndim=1) - data_vars["variant/ID"] = ([DIM_VARIANT], variant_id) + data_vars["variant/id"] = ([DIM_VARIANT], variant_id) attrs = {"contigs": variant_contig_names} return xr.Dataset(data_vars=data_vars, attrs=attrs) diff --git a/sgkit/tests/test_api.py b/sgkit/tests/test_api.py index 9ea2af739..9114a9554 100644 --- a/sgkit/tests/test_api.py +++ b/sgkit/tests/test_api.py @@ -13,22 +13,24 @@ def test_create_genotype_call_dataset(): variant_contig_names = ["chr1"] variant_contig = np.array([0, 0], dtype="i1") - variant_pos = np.array([1000, 2000], dtype="i4") + variant_position = np.array([1000, 2000], dtype="i4") variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1") variant_id = np.array(["rs1", "rs2"], dtype=str) sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) - call_gt = np.array( + call_genotype = np.array( [[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]], dtype="i1" ) - call_gt_phased = np.array([[True, True, False], [True, False, False]], dtype=bool) + call_genotype_phased = np.array( + [[True, True, False], [True, False, False]], dtype=bool + ) ds = create_genotype_call_dataset( variant_contig_names, variant_contig, - variant_pos, + variant_position, variant_alleles, sample_id, - call_gt, - call_gt_phased=call_gt_phased, + call_genotype, + call_genotype_phased=call_genotype_phased, variant_id=variant_id, ) @@ -38,11 +40,11 @@ def test_create_genotype_call_dataset(): assert DIM_ALLELE in ds.dims assert ds.attrs["contigs"] == variant_contig_names - assert_array_equal(ds["variant/CONTIG"], variant_contig) - assert_array_equal(ds["variant/POS"], variant_pos) - assert_array_equal(ds["variant/ALLELES"], variant_alleles) - assert_array_equal(ds["variant/ID"], variant_id) - assert_array_equal(ds["sample/ID"], sample_id) - assert_array_equal(ds["call/GT"], call_gt) - assert_array_equal(ds["call/GT_mask"], call_gt < 0) - 
assert_array_equal(ds["call/GT_phased"], call_gt_phased) + assert_array_equal(ds["variant/contig"], variant_contig) + assert_array_equal(ds["variant/position"], variant_position) + assert_array_equal(ds["variant/alleles"], variant_alleles) + assert_array_equal(ds["variant/id"], variant_id) + assert_array_equal(ds["sample/id"], sample_id) + assert_array_equal(ds["call/genotype"], call_genotype) + assert_array_equal(ds["call/genotype_mask"], call_genotype < 0) + assert_array_equal(ds["call/genotype_phased"], call_genotype_phased) From 4aecff1db800aea2b85f1d194555968f9b4314ea Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 3 Jul 2020 23:42:51 +0000 Subject: [PATCH 05/20] Initial single continuous phenotype regression implementation --- setup.cfg | 2 +- sgkit/stats/__init__.py | 0 sgkit/stats/gwas.py | 88 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 sgkit/stats/__init__.py create mode 100644 sgkit/stats/gwas.py diff --git a/setup.cfg b/setup.cfg index dc5245e1a..35392d97e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = numpy,setuptools,xarray +known_third_party = dask,numpy,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/stats/__init__.py b/sgkit/stats/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sgkit/stats/gwas.py b/sgkit/stats/gwas.py new file mode 100644 index 000000000..72e7a4435 --- /dev/null +++ b/sgkit/stats/gwas.py @@ -0,0 +1,88 @@ +import collections +from typing import Sequence + +import dask.array as da +import numpy as np +from dask.array import stats +from xarray import Dataset + +from ..api import DIM_PLOIDY, DIM_VARIANT + +LinearRegressionResult = collections.namedtuple( + "LinearRegressionResult", ["betas", "t_values", "p_values"] +) + + +def _linear_regression(X, Z, y) -> LinearRegressionResult: + """Efficient linear regression estimation for multiple covariate sets + + Parameters + ---------- + X : (M, N) array-like + "Loop" covariates for which a separate regression will be fit to + individual columns + Z : (M, P) array-like + "Core" covariates that are included in the regressions along + with each loop covariate + y : (M,) + Continuous outcome + + Returns + ------- + tuple + [description] + """ + # Apply orthogonal projection to eliminate core covariates + print("Z", Z) + print("X", X) + print("y", y) + Xp = X - Z @ da.linalg.lstsq(Z, X)[0] + yp = y - Z @ da.linalg.lstsq(Z, y)[0] + + # Estimate coefficients for each loop covariate + Xps = (Xp ** 2).sum(axis=0) + b = (Xp.T @ yp) / Xps + + # Compute statistics and p values for each regression separately + dof = y.shape[0] - Z.shape[1] - 1 + y_resid = yp[:, np.newaxis] - Xp * b + rss = (y_resid ** 2).sum(axis=0) + t_val = b / np.sqrt((rss / dof) / Xps) + p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) + + return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) + + +def _get_loop_covariates(ds: Dataset, dosage: str = None): + if dosage is None: + X = ds["call/genotype"].sum(dim=DIM_PLOIDY) + else: + X = ds[dosage] + return da.asarray(X.data) + + +def _get_core_covariates( + ds: Dataset, covariates: Sequence[str], add_intercept: bool = False +): + X = da.stack([da.asarray(ds[c].data) for c in covariates]).T + if add_intercept: + X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) + return X.rechunk((-1, -1)) + + +def linear_regression( 
+ ds: Dataset, + covariates: Sequence[str], + dosage: str, + trait: str, + add_intercept: bool = True, +): + X = _get_loop_covariates(ds, dosage=dosage) + Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) + y = da.asarray(ds[trait].data) + res = _linear_regression(X.T, Z, y) + return ds.assign( + betas=(DIM_VARIANT, res.betas), + t_values=(DIM_VARIANT, res.t_values), + p_values=(DIM_VARIANT, res.p_values), + ) From 9ff934d48af2a50d8a6746fbc213b7009a9534b2 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:25:54 +0000 Subject: [PATCH 06/20] Basic regression validation on simulated data --- setup.cfg | 2 +- sgkit/stats/association.py | 138 ++++++++++++++++++++++++++++++++ sgkit/stats/gwas.py | 88 -------------------- sgkit/tests/test_association.py | 108 +++++++++++++++++++++++++ 4 files changed, 247 insertions(+), 89 deletions(-) create mode 100644 sgkit/stats/association.py delete mode 100644 sgkit/stats/gwas.py create mode 100644 sgkit/tests/test_association.py diff --git a/setup.cfg b/setup.cfg index 35392d97e..3d335c6b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,numpy,pytest,setuptools,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py new file mode 100644 index 000000000..d84ffe82b --- /dev/null +++ b/sgkit/stats/association.py @@ -0,0 +1,138 @@ +import collections +from typing import Sequence + +import dask.array as da +import numpy as np +import xarray as xr +from dask.array import stats +from xarray import Dataset + +from ..api import DIM_PLOIDY, DIM_VARIANT + +LinearRegressionResult = collections.namedtuple( + "LinearRegressionResult", ["betas", "t_values", "p_values"] +) + + +def _linear_regression(G, X, y) -> LinearRegressionResult: + """Efficient linear regression estimation for multiple covariate sets + + Parameters + ---------- + G : (M, N) array-like + "Loop" covariates for which a separate regression will be fit to + individual columns + X : (M, P) array-like + "Core" covariates that are included in the regressions along + with each loop covariate + y : (M,) + Continuous outcome + + Returns + ------- + tuple + [description] + """ + G, X = da.asarray(G), da.asarray(X) + + # Apply orthogonal projection to eliminate core covariates + # Note: dask lstsq will not work with numpy arrays + Gp = G - X @ da.linalg.lstsq(X, G)[0] + yp = y - X @ da.linalg.lstsq(X, y)[0] + + # Estimate coefficients for each loop covariate + # Note: a key assumption here is that 0-mean residuals + # from projection require no extra terms in variance + # estimate for loop covariates (columns of G), which is + # only true when an intercept is present. 
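The projection step is an application of the Frisch-Waugh-Lovell theorem: regressing the residualized outcome on the residualized loop covariate recovers the same coefficient as a joint OLS fit that includes the core covariates. A self-contained numpy sketch of that equivalence, on simulated values rather than anything from this patch:

    import numpy as np

    rs = np.random.RandomState(0)
    n = 50
    X = np.c_[np.ones(n), rs.normal(size=(n, 2))]   # intercept + core covariates
    g = rs.uniform(0, 2, size=n)                    # one loop covariate (dosage)
    y = 0.5 * g + X @ np.array([1.0, -0.3, 0.2]) + rs.normal(scale=0.1, size=n)

    gp = g - X @ np.linalg.lstsq(X, g, rcond=None)[0]   # residualize against X
    yp = y - X @ np.linalg.lstsq(X, y, rcond=None)[0]
    beta_fwl = gp @ yp / (gp ** 2).sum()

    beta_joint = np.linalg.lstsq(np.c_[g, X], y, rcond=None)[0][0]
    assert np.allclose(beta_fwl, beta_joint)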
+ Gps = (Gp ** 2).sum(axis=0) + b = (Gp.T @ yp) / Gps + + # Compute statistics and p values for each regression separately + # Note: dof w/ -2 includes loop covariate + dof = y.shape[0] - X.shape[1] - 2 + y_resid = yp[:, np.newaxis] - Gp * b + rss = (y_resid ** 2).sum(axis=0) + t_val = b / np.sqrt((rss / dof) / Gps) + p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) + + return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) + + +def _get_loop_covariates(ds: Dataset, dosage: str = None): + if dosage is None: + G = ds["call/genotype"].sum(dim=DIM_PLOIDY) + else: + G = ds[dosage] + return da.asarray(G.data) + + +def _get_core_covariates( + ds: Dataset, covariates: Sequence[str], add_intercept: bool = False +): + if not add_intercept and not covariates: + raise ValueError( + "At least one covariate must be provided when `add_intercept`=False" + ) + X = da.stack([da.asarray(ds[c].data) for c in covariates]).T + if add_intercept: + X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) + # Note: dask qr decomp (used by lstsq) requires no chunking in one + # dimension, and because dim 0 will be far greater than the number + # of covariates for the large majority of use cases, chunking + # should be removed from dim 1 + return X.rechunk((None, -1)) + + +def linear_regression( + ds: Dataset, + covariates: Sequence[str], + dosage: str, + trait: str, + add_intercept: bool = True, +): + """Run linear regression to identify trait associations with genetic variation data + + TODO: Explain vectorization scheme and add references + + Warning: Regression statistics from this implementation are only valid when an intercept + is present. The `add_intercept` flag is a convenience for adding one when not + already present, but there is currently no parameterization for intercept-free regression. + + Parameters + ---------- + ds : Dataset + Dataset containing necessary dependent and independent variables + covariates : Sequence[str] + Covariate variable names + dosage : str + Dosage variable name where "dosage" array can contain represent + one of several possible quantities, e.g.: + - Alternate allele counts + - Recessive or dominant allele encodings + - True dosages as computed from imputed or probabilistic variant calls + trait : str + Trait (e.g. phenotype) variable name; must be continuous + add_intercept : bool, optional + Add intercept term to covariate set, by default True. 
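A sketch of calling this on a dataset that already carries dosage, covariate, and trait variables; the variable names mirror the test fixtures and ds itself is assumed:

    from sgkit.stats.association import linear_regression

    res = linear_regression(
        ds,                                   # xarray.Dataset holding the variables below
        covariates=["covar_0", "covar_1"],    # per-sample covariate variables
        dosage="dosage",                      # (variants, samples) dosage variable
        trait="trait_0",                      # continuous per-sample outcome
    )
    # res is a Dataset with per-variant "betas", "t_values" and "p_values".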
+ + Returns + ------- + Dataset + Regression result containing: + - betas: beta values associated with each independent variant regressed + against the trait + - t_values: T-test statistic for beta estimate + - p_values: P-value for beta estimate (float in [0, 1]) + """ + G = _get_loop_covariates(ds, dosage=dosage) + Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) + y = da.asarray(ds[trait].data) + res = _linear_regression(G.T, Z, y) + return xr.Dataset( + dict( + betas=(DIM_VARIANT, res.betas), + t_values=(DIM_VARIANT, res.t_values), + p_values=(DIM_VARIANT, res.p_values), + ) + ) diff --git a/sgkit/stats/gwas.py b/sgkit/stats/gwas.py deleted file mode 100644 index 72e7a4435..000000000 --- a/sgkit/stats/gwas.py +++ /dev/null @@ -1,88 +0,0 @@ -import collections -from typing import Sequence - -import dask.array as da -import numpy as np -from dask.array import stats -from xarray import Dataset - -from ..api import DIM_PLOIDY, DIM_VARIANT - -LinearRegressionResult = collections.namedtuple( - "LinearRegressionResult", ["betas", "t_values", "p_values"] -) - - -def _linear_regression(X, Z, y) -> LinearRegressionResult: - """Efficient linear regression estimation for multiple covariate sets - - Parameters - ---------- - X : (M, N) array-like - "Loop" covariates for which a separate regression will be fit to - individual columns - Z : (M, P) array-like - "Core" covariates that are included in the regressions along - with each loop covariate - y : (M,) - Continuous outcome - - Returns - ------- - tuple - [description] - """ - # Apply orthogonal projection to eliminate core covariates - print("Z", Z) - print("X", X) - print("y", y) - Xp = X - Z @ da.linalg.lstsq(Z, X)[0] - yp = y - Z @ da.linalg.lstsq(Z, y)[0] - - # Estimate coefficients for each loop covariate - Xps = (Xp ** 2).sum(axis=0) - b = (Xp.T @ yp) / Xps - - # Compute statistics and p values for each regression separately - dof = y.shape[0] - Z.shape[1] - 1 - y_resid = yp[:, np.newaxis] - Xp * b - rss = (y_resid ** 2).sum(axis=0) - t_val = b / np.sqrt((rss / dof) / Xps) - p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) - - return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) - - -def _get_loop_covariates(ds: Dataset, dosage: str = None): - if dosage is None: - X = ds["call/genotype"].sum(dim=DIM_PLOIDY) - else: - X = ds[dosage] - return da.asarray(X.data) - - -def _get_core_covariates( - ds: Dataset, covariates: Sequence[str], add_intercept: bool = False -): - X = da.stack([da.asarray(ds[c].data) for c in covariates]).T - if add_intercept: - X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) - return X.rechunk((-1, -1)) - - -def linear_regression( - ds: Dataset, - covariates: Sequence[str], - dosage: str, - trait: str, - add_intercept: bool = True, -): - X = _get_loop_covariates(ds, dosage=dosage) - Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) - y = da.asarray(ds[trait].data) - res = _linear_regression(X.T, Z, y) - return ds.assign( - betas=(DIM_VARIANT, res.betas), - t_values=(DIM_VARIANT, res.t_values), - p_values=(DIM_VARIANT, res.p_values), - ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py new file mode 100644 index 000000000..8415253b8 --- /dev/null +++ b/sgkit/tests/test_association.py @@ -0,0 +1,108 @@ +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from sgkit.stats.association import linear_regression + + +def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, 
seed=None): + """Test data simulator for multiple variant associations to a continuous outcome + + Outcomes for each variant are simulated separately based on linear combinations + of randomly generated fixed effect covariates as well as the variant itself. + + This does not add an intercept term in covariates. + + Parameters + ---------- + n : int, optional + Number of samples + m : int, optional + Number of variants + p : int, optional + Number of covariates + e_std : float, optional + Standard deviation for noise term + b_zero_slice : slice + Variant beta values to zero out, defaults to `slice(m // 2)` + meaning that the first half will all be 0. + Set to `slice(0)` to disable. + + Returns + ------- + n : int + Number of samples + m : int + Number of variants + p : int + Number of covariates + g : (n, m) array-like + Simulated genotype dosage + x : (n, p) array-like + Simulated covariates + bg : (m,) array-like + Variant betas + bx : (p,) array-like + Covariate betas + ys : (m, n) array-like + Outcomes for each column in genotypes i.e. variant + """ + if b_zero_slice is None: + b_zero_slice = slice(m // 2) + np.random.seed(seed) + g = np.random.uniform(size=(n, m), low=0, high=2) + x = np.random.normal(size=(n, p)) + bg = np.random.normal(size=m) + bg[b_zero_slice or slice(m // 2)] = 0 + bx = np.random.normal(size=p) + e = np.random.normal(size=n, scale=e_std) + + # Simulate y values using each variant independently + ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) + return n, m, p, g, x, bg, bx, ys + + +def _generate_test_dataset(**kwargs): + n, m, p, g, x, bg, bx, ys = _generate_test_data(**kwargs) + data_vars = {} + # TODO: use literals or constants for dimension names? + data_vars["dosage"] = (["variant", "sample"], g.T) + for i in range(x.shape[1]): + data_vars[f"covar_{i}"] = (["sample"], x[:, i]) + for i in range(len(ys)): + data_vars[f"trait_{i}"] = (["sample"], ys[i]) + attrs = dict(betas=bg) + return xr.Dataset(data_vars, attrs=attrs) + + +@pytest.fixture +def ds(): + return _generate_test_dataset() + + +def get_statistics(ds, **kwargs): + dfr = [] + for i in range(ds.dims["variant"]): + dsr = linear_regression(ds, dosage="dosage", trait=f"trait_{i}", **kwargs) + dfr.append(dsr.to_dataframe().iloc[[i]]) + return pd.concat(dfr).reset_index(drop=True) + + +def test_linear_regression_statistics(ds): + df = get_statistics( + ds, covariates=["covar_0", "covar_1", "covar_2"], add_intercept=True + ) + # TODO: should we standardize on printing useful debugging info in the event of failures? 
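In compact form, the simulator above draws one outcome vector per variant j as

    y[j] = g[:, j] * bg[j] + x @ bx + e

with a single noise draw e (scale e_std) shared across variants; because the first half of bg is zeroed, the assertions just below expect p-values above 0.05 for that half and below 0.05 for the rest.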
+ print(df) + np.testing.assert_allclose(df["betas"], ds.attrs["betas"], atol=1e-3) + mid_idx = ds.dims["variant"] // 2 + assert np.all(df["p_values"].iloc[:mid_idx] > 0.05) + assert np.all(df["p_values"].iloc[mid_idx:] < 0.05) + + +def test_linear_regression_raise_on_no_covars(ds): + with pytest.raises(ValueError): + linear_regression( + ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False + ) From 2b83a9ff1d196979702c822fe548222f03a994a9 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:30:40 +0000 Subject: [PATCH 07/20] Adding dask dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6fefda50d..d3fb40fc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy xarray +dask From 7806732b6b26c0893c84fdd32d2ec35a3b4cd76a Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:32:41 +0000 Subject: [PATCH 08/20] Adding dask dependency --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d3fb40fc9..8909d53ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy xarray -dask +dask[array] From 2dc6f9197d203658b06998c9ca3128b041de05f2 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 13:34:08 +0000 Subject: [PATCH 09/20] Adding scipy dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8909d53ea..894396a41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy xarray dask[array] +scipy From 43c163ee84beb5fc80be4907712e5e3de9315dc9 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sat, 4 Jul 2020 18:43:17 +0000 Subject: [PATCH 10/20] Adding statsmodels validation to tests --- README.md | 2 +- requirements-dev.txt | 1 + setup.cfg | 2 +- sgkit/stats/association.py | 35 +++++++++-------- sgkit/tests/test_association.py | 67 ++++++++++++++++++++++++++------- 5 files changed, 77 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 14e4bad35..194aae428 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ pytest To check code coverage and get a coverage report, run ```bash -pytest --cov=sgkit +pytest --cov=sgkit --cov-report term-missing ``` ### Code standards diff --git a/requirements-dev.txt b/requirements-dev.txt index afc8cb84e..f3411c4c3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ mypy pre-commit pytest pytest-cov +statsmodels diff --git a/setup.cfg b/setup.cfg index 3d335c6b4..700ae0930 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,numpy,pandas,pytest,setuptools,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,statsmodels,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index d84ffe82b..730287bf6 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -10,7 +10,7 @@ from ..api import DIM_PLOIDY, DIM_VARIANT LinearRegressionResult = collections.namedtuple( - "LinearRegressionResult", ["betas", "t_values", "p_values"] + "LinearRegressionResult", ["beta", "t_value", "p_value"] ) @@ -30,13 +30,16 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: Returns ------- - tuple - [description] + LinearRegressionResult + Regression statistics and coefficient estimates """ G, 
X = da.asarray(G), da.asarray(X) # Apply orthogonal projection to eliminate core covariates - # Note: dask lstsq will not work with numpy arrays + # Note: QR factorization or SVD should be used here to find + # what are effectively OLS residuals rather than matrix inverse + # to avoid need for MxM array; additionally, dask.lstsq will not + # work with numpy arrays Gp = G - X @ da.linalg.lstsq(X, G)[0] yp = y - X @ da.linalg.lstsq(X, y)[0] @@ -50,18 +53,20 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: # Compute statistics and p values for each regression separately # Note: dof w/ -2 includes loop covariate - dof = y.shape[0] - X.shape[1] - 2 + dof = y.shape[0] - X.shape[1] - 1 y_resid = yp[:, np.newaxis] - Gp * b rss = (y_resid ** 2).sum(axis=0) t_val = b / np.sqrt((rss / dof) / Gps) p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) - return LinearRegressionResult(betas=b, t_values=t_val, p_values=p_val) + return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) def _get_loop_covariates(ds: Dataset, dosage: str = None): if dosage is None: - G = ds["call/genotype"].sum(dim=DIM_PLOIDY) + # TODO: This should be (probably gwas-specific) allele + # count with sex chromosome considerations + G = ds["call/genotype"].sum(dim=DIM_PLOIDY) # pragma: no cover else: G = ds[dosage] return da.asarray(G.data) @@ -112,18 +117,18 @@ def linear_regression( - Recessive or dominant allele encodings - True dosages as computed from imputed or probabilistic variant calls trait : str - Trait (e.g. phenotype) variable name; must be continuous + Trait (e.g. phenotype) variable name, must be continuous add_intercept : bool, optional - Add intercept term to covariate set, by default True. + Add intercept term to covariate set, by default True Returns ------- Dataset Regression result containing: - - betas: beta values associated with each independent variant regressed + - beta: beta values associated with each independent variant regressed against the trait - - t_values: T-test statistic for beta estimate - - p_values: P-value for beta estimate (float in [0, 1]) + - t_value: T-test statistic for beta estimate + - p_value: P-value for beta estimate (unscaled float in [0, 1]) """ G = _get_loop_covariates(ds, dosage=dosage) Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) @@ -131,8 +136,8 @@ def linear_regression( res = _linear_regression(G.T, Z, y) return xr.Dataset( dict( - betas=(DIM_VARIANT, res.betas), - t_values=(DIM_VARIANT, res.t_values), - p_values=(DIM_VARIANT, res.p_values), + beta=(DIM_VARIANT, res.beta), + t_value=(DIM_VARIANT, res.t_value), + p_value=(DIM_VARIANT, res.p_value), ) ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 8415253b8..612820c04 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +import statsmodels.api as sm import xarray as xr from sgkit.stats.association import linear_regression @@ -72,7 +73,7 @@ def _generate_test_dataset(**kwargs): data_vars[f"covar_{i}"] = (["sample"], x[:, i]) for i in range(len(ys)): data_vars[f"trait_{i}"] = (["sample"], ys[i]) - attrs = dict(betas=bg) + attrs = dict(beta=bg) return xr.Dataset(data_vars, attrs=attrs) @@ -81,24 +82,64 @@ def ds(): return _generate_test_dataset() -def get_statistics(ds, **kwargs): - dfr = [] +def _sm_statistics(ds, i, add_intercept): + X = [] + # Make sure first independent variable is variant + X.append(ds["dosage"].values[i]) + for v in [c 
for c in list(ds.keys()) if c.startswith("covar_")]: + X.append(ds[v].values) + if add_intercept: + X.append(np.ones(ds.dims["sample"])) + X = np.stack(X).T + y = ds[f"trait_{i}"].values + + return sm.OLS(y, X, hasconst=True).fit() + + +def _get_statistics(ds, add_intercept, **kwargs): + df_pred, df_true = [], [] for i in range(ds.dims["variant"]): - dsr = linear_regression(ds, dosage="dosage", trait=f"trait_{i}", **kwargs) - dfr.append(dsr.to_dataframe().iloc[[i]]) - return pd.concat(dfr).reset_index(drop=True) + dsr = linear_regression( + ds, + dosage="dosage", + trait=f"trait_{i}", + add_intercept=add_intercept, + **kwargs, + ) + res = _sm_statistics(ds, i, add_intercept) + df_pred.append(dsr.to_dataframe().iloc[i].to_dict()) + df_true.append(dict(t_value=res.tvalues[0], p_value=res.pvalues[0])) + return pd.DataFrame(df_pred), pd.DataFrame(df_true) def test_linear_regression_statistics(ds): - df = get_statistics( + def validate(dfp, dft): + # TODO: should we standardize on printing useful debugging info in the event of failures? + print(dfp) + print(dft) + + # Validate results at a higher level, looking only for recapitulation + # of more obvious inferences based on how the data was simulated + np.testing.assert_allclose(dfp["beta"], ds.attrs["beta"], atol=1e-3) + mid_idx = ds.dims["variant"] // 2 + assert np.all(dfp["p_value"].iloc[:mid_idx] > 0.05) + assert np.all(dfp["p_value"].iloc[mid_idx:] < 0.05) + + # Validate more precisely against statsmodels results + np.testing.assert_allclose(dfp["t_value"], dft["t_value"]) + np.testing.assert_allclose(dfp["p_value"], dft["p_value"]) + + dfp, dft = _get_statistics( ds, covariates=["covar_0", "covar_1", "covar_2"], add_intercept=True ) - # TODO: should we standardize on printing useful debugging info in the event of failures? - print(df) - np.testing.assert_allclose(df["betas"], ds.attrs["betas"], atol=1e-3) - mid_idx = ds.dims["variant"] // 2 - assert np.all(df["p_values"].iloc[:mid_idx] > 0.05) - assert np.all(df["p_values"].iloc[mid_idx:] < 0.05) + validate(dfp, dft) + + dfp, dft = _get_statistics( + ds.assign(covar_3=("sample", np.ones(ds.dims["sample"]))), + covariates=["covar_0", "covar_1", "covar_2", "covar_3"], + add_intercept=False, + ) + validate(dfp, dft) def test_linear_regression_raise_on_no_covars(ds): From 3ca89dc0ab6b53c4e5b327fc02cd2394e996a6f9 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 11:52:16 +0000 Subject: [PATCH 11/20] Adding docs --- sgkit/stats/association.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 730287bf6..5cab2436f 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -44,7 +44,7 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: yp = y - X @ da.linalg.lstsq(X, y)[0] # Estimate coefficients for each loop covariate - # Note: a key assumption here is that 0-mean residuals + # Note: A key assumption here is that 0-mean residuals # from projection require no extra terms in variance # estimate for loop covariates (columns of G), which is # only true when an intercept is present. 
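Spelled out, the per-variant statistics computed just below use the residualized dosage and outcome from the projection above, with dof = n - ncols(X) - 1 (the extra 1 accounting for the loop covariate):

    \hat\beta_j = \frac{\tilde g_j^\top \tilde y}{\lVert \tilde g_j \rVert^2},
    \qquad
    t_j = \frac{\hat\beta_j}{\sqrt{(\mathrm{RSS}_j / \mathrm{dof}) / \lVert \tilde g_j \rVert^2}},
    \qquad
    p_j = 2\,\Pr\bigl(T_{\mathrm{dof}} \ge |t_j|\bigr),
    \qquad
    \mathrm{RSS}_j = \lVert \tilde y - \tilde g_j \hat\beta_j \rVert^2

This is the standard single-coefficient OLS t-test, just evaluated for every variant at once.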
@@ -52,11 +52,10 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: b = (Gp.T @ yp) / Gps # Compute statistics and p values for each regression separately - # Note: dof w/ -2 includes loop covariate dof = y.shape[0] - X.shape[1] - 1 y_resid = yp[:, np.newaxis] - Gp * b rss = (y_resid ** 2).sum(axis=0) - t_val = b / np.sqrt((rss / dof) / Gps) + t_val = b / np.sqrt(rss / dof / Gps) p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) @@ -96,13 +95,19 @@ def linear_regression( trait: str, add_intercept: bool = True, ): - """Run linear regression to identify trait associations with genetic variation data + """Run linear regression to identify continuous trait associations with genetic variants - TODO: Explain vectorization scheme and add references + This method solves OLS regressions for each variant simultaneously and reports + effect statistics as defined in [1]. This is facilitated by the removal of + sample (i.e. person/individual) covariates through orthogonal projection + of both the genetic variant and phenotype data [2]. A consequence of this + rotation is that effect sizes and significances cannot be reported for + covariates, only variants. - Warning: Regression statistics from this implementation are only valid when an intercept - is present. The `add_intercept` flag is a convenience for adding one when not - already present, but there is currently no parameterization for intercept-free regression. + Warning: Regression statistics from this implementation are only valid when an + intercept is present. The `add_intercept` flag is a convenience for adding one + when not already present, but there is currently no parameterization for + intercept-free regression. Parameters ---------- @@ -121,6 +126,15 @@ def linear_regression( add_intercept : bool, optional Add intercept term to covariate set, by default True + References + ---------- + - [1] Friedman, Jerome, Trevor Hastie, and Robert Tibshirani. 2001. The Elements + of Statistical Learning. Vol. 1. Springer series in statistics New York. + - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson, + Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. “Efficient + Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.” + Nature Genetics 47 (3): 284–90. 
+ Returns ------- Dataset From 461e1f6e05dd27f25ac3afa3aa4f06db2891fbb7 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 12:04:01 +0000 Subject: [PATCH 12/20] Switch to random state instance for simulated data --- sgkit/tests/test_association.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 612820c04..6ec80ae0c 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -7,7 +7,7 @@ from sgkit.stats.association import linear_regression -def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=None): +def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1): """Test data simulator for multiple variant associations to a continuous outcome Outcomes for each variant are simulated separately based on linear combinations @@ -51,13 +51,13 @@ def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=N """ if b_zero_slice is None: b_zero_slice = slice(m // 2) - np.random.seed(seed) - g = np.random.uniform(size=(n, m), low=0, high=2) - x = np.random.normal(size=(n, p)) - bg = np.random.normal(size=m) + rs = np.random.RandomState(seed) + g = rs.uniform(size=(n, m), low=0, high=2) + x = rs.normal(size=(n, p)) + bg = rs.normal(size=m) bg[b_zero_slice or slice(m // 2)] = 0 - bx = np.random.normal(size=p) - e = np.random.normal(size=n, scale=e_std) + bx = rs.normal(size=p) + e = rs.normal(size=n, scale=e_std) # Simulate y values using each variant independently ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) @@ -114,7 +114,6 @@ def _get_statistics(ds, add_intercept, **kwargs): def test_linear_regression_statistics(ds): def validate(dfp, dft): - # TODO: should we standardize on printing useful debugging info in the event of failures? print(dfp) print(dft) From be2cfc578134ab8ccb45534d5cce60ef3e2bf3fc Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 12:05:35 +0000 Subject: [PATCH 13/20] Adding coveragerc to remove tests from coverage check --- .coveragerc | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..083aa052e --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +omit = + sgkit/tests/* From 0627f5221577bc520416824fa245d290b236fe46 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Sun, 5 Jul 2020 12:32:02 +0000 Subject: [PATCH 14/20] Update reference --- sgkit/stats/association.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 5cab2436f..3c97aa3e6 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -128,8 +128,9 @@ def linear_regression( References ---------- - - [1] Friedman, Jerome, Trevor Hastie, and Robert Tibshirani. 2001. The Elements - of Statistical Learning. Vol. 1. Springer series in statistics New York. + - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements + of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition. + Springer Science & Business Media. - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson, Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. 
“Efficient Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.” From c76c720b67913a72df9eddeabdad3d53f9ae0ce7 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Mon, 6 Jul 2020 11:19:10 +0000 Subject: [PATCH 15/20] Add missing terms to coverage report in build --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9c1f6fc8f..faa91b5fe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,4 +21,4 @@ jobs: uses: pre-commit/action@v2.0.0 - name: Test with pytest and coverage run: | - pytest -v --cov=sgkit + pytest -v --cov=sgkit --cov-report term-missing From 505fba057aba015e691f0efbc61b2191bcf54e0d Mon Sep 17 00:00:00 2001 From: eric-czech Date: Mon, 6 Jul 2020 13:39:58 +0000 Subject: [PATCH 16/20] Return type annotations --- sgkit/stats/association.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 3c97aa3e6..9db8b5e8a 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -4,7 +4,7 @@ import dask.array as da import numpy as np import xarray as xr -from dask.array import stats +from dask.array import Array, stats from xarray import Dataset from ..api import DIM_PLOIDY, DIM_VARIANT @@ -61,7 +61,7 @@ def _linear_regression(G, X, y) -> LinearRegressionResult: return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) -def _get_loop_covariates(ds: Dataset, dosage: str = None): +def _get_loop_covariates(ds: Dataset, dosage: str = None) -> Array: if dosage is None: # TODO: This should be (probably gwas-specific) allele # count with sex chromosome considerations @@ -73,7 +73,7 @@ def _get_loop_covariates(ds: Dataset, dosage: str = None): def _get_core_covariates( ds: Dataset, covariates: Sequence[str], add_intercept: bool = False -): +) -> Array: if not add_intercept and not covariates: raise ValueError( "At least one covariate must be provided when `add_intercept`=False" @@ -94,7 +94,7 @@ def linear_regression( dosage: str, trait: str, add_intercept: bool = True, -): +) -> Dataset: """Run linear regression to identify continuous trait associations with genetic variants This method solves OLS regressions for each variant simultaneously and reports From 2c386e21f72e10d48c35c99794fdddef6ad92509 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Thu, 9 Jul 2020 15:38:58 -0400 Subject: [PATCH 17/20] Filter statsmodels warning on import, remove dim constants, change output variable names --- setup.cfg | 2 +- sgkit/__init__.py | 2 ++ sgkit/stats/association.py | 14 ++++++-------- sgkit/tests/test_association.py | 17 +++++++++++++++-- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/setup.cfg b/setup.cfg index 700ae0930..3d335c6b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ ignore = [isort] default_section = THIRDPARTY known_first_party = sgkit -known_third_party = dask,numpy,pandas,pytest,setuptools,statsmodels,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 8b170a470..3340b680c 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -5,6 +5,7 @@ DIM_VARIANT, create_genotype_call_dataset, ) +from .stats.association import linear_regression __all__ = [ "DIM_ALLELE", @@ -12,4 +13,5 @@ "DIM_SAMPLE", "DIM_VARIANT", "create_genotype_call_dataset", + 
"linear_regression", ] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 9db8b5e8a..d9939efd8 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -7,8 +7,6 @@ from dask.array import Array, stats from xarray import Dataset -from ..api import DIM_PLOIDY, DIM_VARIANT - LinearRegressionResult = collections.namedtuple( "LinearRegressionResult", ["beta", "t_value", "p_value"] ) @@ -65,7 +63,7 @@ def _get_loop_covariates(ds: Dataset, dosage: str = None) -> Array: if dosage is None: # TODO: This should be (probably gwas-specific) allele # count with sex chromosome considerations - G = ds["call/genotype"].sum(dim=DIM_PLOIDY) # pragma: no cover + G = ds["call/genotype"].sum(dim="ploidy") # pragma: no cover else: G = ds[dosage] return da.asarray(G.data) @@ -150,9 +148,9 @@ def linear_regression( y = da.asarray(ds[trait].data) res = _linear_regression(G.T, Z, y) return xr.Dataset( - dict( - beta=(DIM_VARIANT, res.beta), - t_value=(DIM_VARIANT, res.t_value), - p_value=(DIM_VARIANT, res.p_value), - ) + { + "variant/beta": ("variants", res.beta), + "variant/t_value": ("variants", res.t_value), + "variant/p_value": ("variants", res.p_value), + } ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 6ec80ae0c..5b852f09d 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -1,11 +1,19 @@ +import warnings + import numpy as np import pandas as pd import pytest -import statsmodels.api as sm import xarray as xr from sgkit.stats.association import linear_regression +with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # Ignore: DeprecationWarning: Using or importing the ABCs from 'collections' + # instead of from 'collections.abc' is deprecated since Python 3.3, + # and in 3.9 it will stop working + import statsmodels.api as sm + def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1): """Test data simulator for multiple variant associations to a continuous outcome @@ -107,7 +115,12 @@ def _get_statistics(ds, add_intercept, **kwargs): **kwargs, ) res = _sm_statistics(ds, i, add_intercept) - df_pred.append(dsr.to_dataframe().iloc[i].to_dict()) + df_pred.append( + dsr.to_dataframe() + .rename(columns=lambda c: c.replace("variant/", "")) + .iloc[i] + .to_dict() + ) df_true.append(dict(t_value=res.tvalues[0], p_value=res.pvalues[0])) return pd.DataFrame(df_pred), pd.DataFrame(df_true) From a4835a295c667deb6f08e01bb24b6432d3cc3fa8 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 10 Jul 2020 17:40:41 -0400 Subject: [PATCH 18/20] Renaming linear_regression --- sgkit/__init__.py | 4 ++-- sgkit/stats/association.py | 6 +++--- sgkit/tests/test_association.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 3340b680c..6417b5646 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -5,7 +5,7 @@ DIM_VARIANT, create_genotype_call_dataset, ) -from .stats.association import linear_regression +from .stats.association import gwas_linear_regression __all__ = [ "DIM_ALLELE", @@ -13,5 +13,5 @@ "DIM_SAMPLE", "DIM_VARIANT", "create_genotype_call_dataset", - "linear_regression", + "gwas_linear_regression", ] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index d9939efd8..a4e6194d8 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -12,7 +12,7 @@ ) -def _linear_regression(G, X, y) -> LinearRegressionResult: +def _gwas_linear_regression(G, X, 
y) -> LinearRegressionResult: """Efficient linear regression estimation for multiple covariate sets Parameters @@ -86,7 +86,7 @@ def _get_core_covariates( return X.rechunk((None, -1)) -def linear_regression( +def gwas_linear_regression( ds: Dataset, covariates: Sequence[str], dosage: str, @@ -146,7 +146,7 @@ def linear_regression( G = _get_loop_covariates(ds, dosage=dosage) Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) y = da.asarray(ds[trait].data) - res = _linear_regression(G.T, Z, y) + res = _gwas_linear_regression(G.T, Z, y) return xr.Dataset( { "variant/beta": ("variants", res.beta), diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 5b852f09d..533cf9be7 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -5,7 +5,7 @@ import pytest import xarray as xr -from sgkit.stats.association import linear_regression +from sgkit.stats.association import gwas_linear_regression with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) @@ -107,7 +107,7 @@ def _sm_statistics(ds, i, add_intercept): def _get_statistics(ds, add_intercept, **kwargs): df_pred, df_true = [], [] for i in range(ds.dims["variant"]): - dsr = linear_regression( + dsr = gwas_linear_regression( ds, dosage="dosage", trait=f"trait_{i}", @@ -156,6 +156,6 @@ def validate(dfp, dft): def test_linear_regression_raise_on_no_covars(ds): with pytest.raises(ValueError): - linear_regression( + gwas_linear_regression( ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False ) From f2c0352fe5da2595cc92fa1eda43eabc71d57075 Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 17 Jul 2020 09:55:53 -0400 Subject: [PATCH 19/20] Fixing mypy errors --- .pre-commit-config.yaml | 2 +- setup.cfg | 6 ++++ sgkit/stats/association.py | 10 +++++-- sgkit/tests/test_association.py | 50 +++++++++++++++++++-------------- sgkit/typing.py | 6 +++- 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b114f16f..1dc9d203a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,5 +29,5 @@ repos: rev: v0.782 hooks: - id: mypy - args: ["--strict"] + args: ["--strict", "--show-error-codes"] additional_dependencies: ["numpy", "xarray"] diff --git a/setup.cfg b/setup.cfg index f2c098dcd..c8097812a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,8 +60,14 @@ line_length = 88 [mypy-numpy.*] ignore_missing_imports = True +[mypy-pandas.*] +ignore_missing_imports = True +[mypy-dask.*] +ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True +[mypy-statsmodels.*] +ignore_missing_imports = True [mypy-setuptools] ignore_missing_imports = True [mypy-sgkit.tests.*] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index a4e6194d8..1f280bf36 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -1,5 +1,5 @@ import collections -from typing import Sequence +from typing import Optional, Sequence import dask.array as da import numpy as np @@ -7,12 +7,16 @@ from dask.array import Array, stats from xarray import Dataset +from ..typing import ArrayLike + LinearRegressionResult = collections.namedtuple( "LinearRegressionResult", ["beta", "t_value", "p_value"] ) -def _gwas_linear_regression(G, X, y) -> LinearRegressionResult: +def _gwas_linear_regression( + G: ArrayLike, X: ArrayLike, y: ArrayLike +) -> LinearRegressionResult: """Efficient linear regression estimation for multiple covariate sets Parameters @@ -59,7 
+63,7 @@ def _gwas_linear_regression(G, X, y) -> LinearRegressionResult: return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) -def _get_loop_covariates(ds: Dataset, dosage: str = None) -> Array: +def _get_loop_covariates(ds: Dataset, dosage: Optional[str] = None) -> Array: if dosage is None: # TODO: This should be (probably gwas-specific) allele # count with sex chromosome considerations diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 533cf9be7..2a3a8b69b 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -1,11 +1,15 @@ import warnings +from typing import Any, Dict, List, Optional, Tuple import numpy as np import pandas as pd import pytest import xarray as xr +from pandas import DataFrame +from xarray import Dataset from sgkit.stats.association import gwas_linear_regression +from sgkit.typing import ArrayLike with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) @@ -13,9 +17,17 @@ # instead of from 'collections.abc' is deprecated since Python 3.3, # and in 3.9 it will stop working import statsmodels.api as sm + from statsmodels.regression.linear_model import RegressionResultsWrapper -def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1): +def _generate_test_data( + n: int = 100, + m: int = 10, + p: int = 3, + e_std: float = 0.001, + b_zero_slice: Optional[slice] = None, + seed: Optional[int] = 1, +) -> Tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]: """Test data simulator for multiple variant associations to a continuous outcome Outcomes for each variant are simulated separately based on linear combinations @@ -40,20 +52,12 @@ def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1 Returns ------- - n : int - Number of samples - m : int - Number of variants - p : int - Number of covariates g : (n, m) array-like Simulated genotype dosage x : (n, p) array-like Simulated covariates bg : (m,) array-like Variant betas - bx : (p,) array-like - Covariate betas ys : (m, n) array-like Outcomes for each column in genotypes i.e. variant """ @@ -69,28 +73,29 @@ def _generate_test_data(n=100, m=10, p=3, e_std=0.001, b_zero_slice=None, seed=1 # Simulate y values using each variant independently ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) - return n, m, p, g, x, bg, bx, ys + return g, x, bg, ys -def _generate_test_dataset(**kwargs): - n, m, p, g, x, bg, bx, ys = _generate_test_data(**kwargs) +def _generate_test_dataset(**kwargs: Any) -> Dataset: + g, x, bg, ys = _generate_test_data(**kwargs) data_vars = {} - # TODO: use literals or constants for dimension names? 
data_vars["dosage"] = (["variant", "sample"], g.T) for i in range(x.shape[1]): data_vars[f"covar_{i}"] = (["sample"], x[:, i]) for i in range(len(ys)): data_vars[f"trait_{i}"] = (["sample"], ys[i]) attrs = dict(beta=bg) - return xr.Dataset(data_vars, attrs=attrs) + return xr.Dataset(data_vars, attrs=attrs) # type: ignore[arg-type] -@pytest.fixture -def ds(): +@pytest.fixture # type: ignore[misc] +def ds() -> Dataset: return _generate_test_dataset() -def _sm_statistics(ds, i, add_intercept): +def _sm_statistics( + ds: Dataset, i: int, add_intercept: bool +) -> RegressionResultsWrapper: X = [] # Make sure first independent variable is variant X.append(ds["dosage"].values[i]) @@ -104,8 +109,11 @@ def _sm_statistics(ds, i, add_intercept): return sm.OLS(y, X, hasconst=True).fit() -def _get_statistics(ds, add_intercept, **kwargs): - df_pred, df_true = [], [] +def _get_statistics( + ds: Dataset, add_intercept: bool, **kwargs: Any +) -> Tuple[DataFrame, DataFrame]: + df_pred: List[Dict[str, Any]] = [] + df_true: List[Dict[str, Any]] = [] for i in range(ds.dims["variant"]): dsr = gwas_linear_regression( ds, @@ -116,7 +124,7 @@ def _get_statistics(ds, add_intercept, **kwargs): ) res = _sm_statistics(ds, i, add_intercept) df_pred.append( - dsr.to_dataframe() + dsr.to_dataframe() # type: ignore[no-untyped-call] .rename(columns=lambda c: c.replace("variant/", "")) .iloc[i] .to_dict() @@ -126,7 +134,7 @@ def _get_statistics(ds, add_intercept, **kwargs): def test_linear_regression_statistics(ds): - def validate(dfp, dft): + def validate(dfp: DataFrame, dft: DataFrame) -> None: print(dfp) print(dft) diff --git a/sgkit/typing.py b/sgkit/typing.py index 3dea6b2be..fab53b2a9 100644 --- a/sgkit/typing.py +++ b/sgkit/typing.py @@ -1,3 +1,7 @@ -from typing import Any +from typing import Any, Union + +import dask.array as da +import numpy as np DType = Any +ArrayLike = Union[np.ndarray, da.Array] From 273375e5d949ae3cd6b22572a6fccb2bd3064f0f Mon Sep 17 00:00:00 2001 From: eric-czech Date: Fri, 17 Jul 2020 11:17:15 -0400 Subject: [PATCH 20/20] Requested changes --- .pre-commit-config.yaml | 2 +- sgkit/stats/association.py | 11 +++++++---- sgkit/tests/test_association.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dc9d203a..48b567f70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,4 +30,4 @@ repos: hooks: - id: mypy args: ["--strict", "--show-error-codes"] - additional_dependencies: ["numpy", "xarray"] + additional_dependencies: ["numpy", "xarray", "dask[array]", "scipy"] diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py index 1f280bf36..65e7061f6 100644 --- a/sgkit/stats/association.py +++ b/sgkit/stats/association.py @@ -1,4 +1,4 @@ -import collections +from dataclasses import dataclass from typing import Optional, Sequence import dask.array as da @@ -9,9 +9,12 @@ from ..typing import ArrayLike -LinearRegressionResult = collections.namedtuple( - "LinearRegressionResult", ["beta", "t_value", "p_value"] -) + +@dataclass +class LinearRegressionResult: + beta: ArrayLike + t_value: ArrayLike + p_value: ArrayLike def _gwas_linear_regression( diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py index 2a3a8b69b..97407a6a5 100644 --- a/sgkit/tests/test_association.py +++ b/sgkit/tests/test_association.py @@ -88,7 +88,7 @@ def _generate_test_dataset(**kwargs: Any) -> Dataset: return xr.Dataset(data_vars, attrs=attrs) # type: ignore[arg-type] -@pytest.fixture # type: 
ignore[misc] +@pytest.fixture(scope="module") # type: ignore[misc] def ds() -> Dataset: return _generate_test_dataset() @@ -163,7 +163,7 @@ def validate(dfp: DataFrame, dft: DataFrame) -> None: def test_linear_regression_raise_on_no_covars(ds): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="At least one covariate must be provided"): gwas_linear_regression( ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False )
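
A minimal usage sketch of the association API as it stands at the end of this series, assuming sgkit at this revision plus dask[array] and scipy are installed. The dataset mirrors the shape of the test fixtures (dosage over (variant, sample); per-sample covariates and trait); the names "dosage", "covar_0", and "trait_0" are illustrative placeholders rather than a fixed sgkit convention.

    import numpy as np
    import xarray as xr

    from sgkit.stats.association import gwas_linear_regression

    rs = np.random.RandomState(0)
    n, m, p = 50, 5, 2  # samples, variants, covariates

    # Shape the dataset like the test fixtures: dosage is (variant, sample);
    # covariates and the trait are per-sample vectors.
    data_vars = {"dosage": (["variant", "sample"], rs.uniform(0, 2, size=(m, n)))}
    for i in range(p):
        data_vars[f"covar_{i}"] = (["sample"], rs.normal(size=n))
    data_vars["trait_0"] = (["sample"], rs.normal(size=n))
    ds = xr.Dataset(data_vars)

    res = gwas_linear_regression(
        ds,
        covariates=[f"covar_{i}" for i in range(p)],
        dosage="dosage",
        trait="trait_0",
        add_intercept=True,
    )

    # One beta / t_value / p_value per variant, prefixed with "variant/"
    # and indexed by the "variants" dimension.
    print(res["variant/beta"].values)
    print(res["variant/p_value"].values)

The results come back as "variant/beta", "variant/t_value", and "variant/p_value"; the tests strip the "variant/" prefix when converting the result to a dataframe so the columns line up with the statsmodels reference values.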
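
For context on what the statsmodels comparison in the tests amounts to, here is a standalone sketch of the reference fit that _sm_statistics performs for a single variant, assuming only numpy and statsmodels. The effect sizes and noise scale below are illustrative and not the values used by _generate_test_data.

    import numpy as np
    import statsmodels.api as sm

    rs = np.random.RandomState(0)
    n = 100
    dosage = rs.uniform(0, 2, size=n)   # one simulated variant
    covar = rs.normal(size=(n, 2))      # two nuisance covariates
    y = 0.5 * dosage + covar @ np.array([0.1, -0.2]) + rs.normal(scale=0.001, size=n)

    # Variant first, then covariates, then an explicit intercept column,
    # matching the column order built by _sm_statistics.
    X = np.column_stack([dosage, covar, np.ones(n)])
    res = sm.OLS(y, X, hasconst=True).fit()

    # t/p statistics for the variant column (index 0); these are the values
    # that variant/t_value and variant/p_value are checked against per variant.
    print(res.tvalues[0], res.pvalues[0])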