Skip to content

Fix dim lengths created from coords or ConstantData #5751 #5763

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 45 additions & 12 deletions pymc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import warnings

from copy import copy
from typing import Any, Dict, List, Optional, Sequence, Union, cast
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast

import aesara
import aesara.tensor as at
Expand Down Expand Up @@ -466,9 +466,15 @@ def align_minibatches(batches=None):
rng.seed()


def determine_coords(model, value, dims: Optional[Sequence[str]] = None) -> Dict[str, Sequence]:
def determine_coords(
model,
value,
dims: Optional[Sequence[Optional[str]]] = None,
coords: Optional[Dict[str, Sequence]] = None,
) -> Tuple[Dict[str, Sequence], Sequence[Optional[str]]]:
"""Determines coordinate values from data or the model (via ``dims``)."""
coords = {}
if coords is None:
coords = {}

# If value is a df or a series, we interpret the index as coords:
if hasattr(value, "index"):
Expand Down Expand Up @@ -499,17 +505,22 @@ def determine_coords(model, value, dims: Optional[Sequence[str]] = None) -> Dict
)
for size, dim in zip(value.shape, dims):
coord = model.coords.get(dim, None)
if coord is None:
if coord is None and dim is not None:
coords[dim] = range(size)

return coords
if dims is None:
# TODO: Also determine dim names from the index
dims = [None] * np.ndim(value)

return coords, dims


def ConstantData(
name: str,
value,
*,
dims: Optional[Sequence[str]] = None,
coords: Optional[Dict[str, Sequence]] = None,
export_index_as_coords=False,
**kwargs,
) -> TensorConstant:
Expand All @@ -522,6 +533,7 @@ def ConstantData(
name,
value,
dims=dims,
coords=coords,
export_index_as_coords=export_index_as_coords,
mutable=False,
**kwargs,
Expand All @@ -534,6 +546,7 @@ def MutableData(
value,
*,
dims: Optional[Sequence[str]] = None,
coords: Optional[Dict[str, Sequence]] = None,
export_index_as_coords=False,
**kwargs,
) -> SharedVariable:
Expand All @@ -546,6 +559,7 @@ def MutableData(
name,
value,
dims=dims,
coords=coords,
export_index_as_coords=export_index_as_coords,
mutable=True,
**kwargs,
Expand All @@ -558,6 +572,7 @@ def Data(
value,
*,
dims: Optional[Sequence[str]] = None,
coords: Optional[Dict[str, Sequence]] = None,
export_index_as_coords=False,
mutable: Optional[bool] = None,
**kwargs,
Expand Down Expand Up @@ -588,9 +603,11 @@ def Data(
:ref:`arviz:quickstart`.
If this parameter is not specified, the random variables will not have dimension
names.
coords : dict, optional
Coordinate values to set for new dimensions introduced by this ``Data`` variable.
export_index_as_coords : bool, default=False
If True, the ``Data`` container will try to infer what the coordinates should be
if there is an index in ``value``.
If True, the ``Data`` container will try to infer what the coordinates
and dimension names should be if there is an index in ``value``.
mutable : bool, optional
Switches between creating a :class:`~aesara.compile.sharedvalue.SharedVariable`
(``mutable=True``) vs. creating a :class:`~aesara.tensor.TensorConstant`
Expand Down Expand Up @@ -624,6 +641,9 @@ def Data(
... model.set_data('data', data_vals)
... idatas.append(pm.sample())
"""
if coords is None:
coords = {}

if isinstance(value, list):
value = np.array(value)

Expand Down Expand Up @@ -665,15 +685,28 @@ def Data(
expected=x.ndim,
)

coords = determine_coords(model, value, dims)

# Optionally infer coords and dims from the input value.
if export_index_as_coords:
model.add_coords(coords)
elif dims:
coords, dims = determine_coords(model, value, dims)

if dims:
if not mutable:
# Use the dimension lengths from the before it was tensorified.
# These can still be tensors, but in many cases they are numeric.
xshape = np.shape(arr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doesn't matter much but x.type.shape should have the same info

else:
xshape = x.shape
# Register new dimension lengths
for d, dname in enumerate(dims):
if not dname in model.dim_lengths:
model.add_coord(dname, values=None, length=x.shape[d])
model.add_coord(
name=dname,
# Note: Coordinate values can't be taken from
# the value, because it could be N-dimensional.
values=coords.get(dname, None),
mutable=mutable,
length=xshape[d],
)

model.add_random_variable(x, dims=dims)

Expand Down
56 changes: 27 additions & 29 deletions pymc/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
from pymc.distributions import joint_logpt
from pymc.distributions.logprob import _get_scaling
from pymc.distributions.transforms import _default_transform
from pymc.exceptions import ImputationWarning, SamplingError, ShapeError, ShapeWarning
from pymc.exceptions import ImputationWarning, SamplingError, ShapeError
from pymc.initial_point import make_initial_point_fn
from pymc.math import flatten_list
from pymc.util import (
Expand Down Expand Up @@ -1071,8 +1071,9 @@ def add_coord(
self,
name: str,
values: Optional[Sequence] = None,
mutable: bool = False,
*,
length: Optional[Variable] = None,
length: Optional[Union[int, Variable]] = None,
):
"""Registers a dimension coordinate with the model.

Expand All @@ -1084,9 +1085,12 @@ def add_coord(
values : optional, array-like
Coordinate values or ``None`` (for auto-numbering).
If ``None`` is passed, a ``length`` must be specified.
mutable : bool
Whether the created dimension should be resizable.
Default is False.
length : optional, scalar
A symbolic scalar of the dimensions length.
Defaults to ``aesara.shared(len(values))``.
A scalar of the dimensions length.
Defaults to ``aesara.tensor.constant(len(values))``.
"""
if name in {"draw", "chain", "__sample__"}:
raise ValueError(
Expand All @@ -1097,7 +1101,9 @@ def add_coord(
raise ValueError(
f"Either `values` or `length` must be specified for the '{name}' dimension."
)
if length is not None and not isinstance(length, Variable):
if isinstance(length, int):
length = at.constant(length)
elif length is not None and not isinstance(length, Variable):
raise ValueError(
f"The `length` passed for the '{name}' coord must be an Aesara Variable or None."
)
Expand All @@ -1109,14 +1115,17 @@ def add_coord(
if not np.array_equal(values, self.coords[name]):
raise ValueError(f"Duplicate and incompatible coordinate: {name}.")
else:
if mutable:
self._dim_lengths[name] = length or aesara.shared(len(values))
else:
self._dim_lengths[name] = length or aesara.tensor.constant(len(values))
self._coords[name] = values
self._dim_lengths[name] = length or aesara.shared(len(values))

def add_coords(
self,
coords: Dict[str, Optional[Sequence]],
*,
lengths: Optional[Dict[str, Union[Variable, None]]] = None,
lengths: Optional[Dict[str, Optional[Union[int, Variable]]]] = None,
):
"""Vectorized version of ``Model.add_coord``."""
if coords is None:
Expand Down Expand Up @@ -1180,24 +1189,20 @@ def set_data(
# NOTE: If there are multiple pm.MutableData containers sharing this dim, but the user only
# changes the values for one of them, they will run into shape problems nonetheless.
if length_changed:
if original_coords is not None:
if new_coords is None:
raise ValueError(
f"The '{name}' variable already had {len(original_coords)} coord values defined for "
f"its {dname} dimension. With the new values this dimension changes to length "
f"{new_length}, so new coord values for the {dname} dimension are required."
)
if isinstance(length_tensor, TensorConstant):
raise ShapeError(
f"Resizing dimension '{dname}' is impossible, because "
f"a 'TensorConstant' stores its length. To be able "
f"to change the dimension length, 'fixed' in "
f"'model.add_coord' must be set to `False`."
)
if length_tensor.owner is None:
# This is the case if the dimension was initialized
# from custom coords, but dimension length was not
# stored in TensorConstant e.g by 'fixed' set to False

warnings.warn(
f"You're changing the shape of a variable "
f"in the '{dname}' dimension which was initialized "
f"from coords. Make sure to update the corresponding "
f"coords, otherwise you'll get shape issues.",
ShapeWarning,
"a 'TensorConstant' stores its length. To be able "
"to change the dimension length, pass `mutable=True` when "
"registering the dimension via `model.add_coord`, "
"or define it via a `pm.MutableData` variable."
)
else:
length_belongs_to = length_tensor.owner.inputs[0].owner.inputs[0]
Expand All @@ -1210,13 +1215,6 @@ def set_data(
actual=new_length,
expected=old_length,
)
if original_coords is not None:
if new_coords is None:
raise ValueError(
f"The '{name}' variable already had {len(original_coords)} coord values defined for "
f"its {dname} dimension. With the new values this dimension changes to length "
f"{new_length}, so new coord values for the {dname} dimension are required."
)
if isinstance(length_tensor, ScalarSharedVariable):
# Updating the shared variable resizes dependent nodes that use this dimension for their `size`.
length_tensor.set_value(new_length)
Expand Down
28 changes: 23 additions & 5 deletions pymc/tests/test_data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from aesara import shared
from aesara.compile.sharedvalue import SharedVariable
from aesara.tensor.sharedvar import ScalarSharedVariable
from aesara.tensor import TensorConstant
from aesara.tensor.var import TensorVariable

import pymc as pm
Expand Down Expand Up @@ -315,29 +315,47 @@ def test_explicit_coords(self):
}
# pass coordinates explicitly, use numpy array in Data container
with pm.Model(coords=coords) as pmodel:
# Dims created from coords are constant by default
assert isinstance(pmodel.dim_lengths["rows"], TensorConstant)
assert isinstance(pmodel.dim_lengths["columns"], TensorConstant)
pm.MutableData("observations", data, dims=("rows", "columns"))
# new data with same shape
# new data with same (!) shape
pm.set_data({"observations": data + 1})
# new data with same shape and coords
# new data with same (!) shape and coords
pm.set_data({"observations": data}, coords=coords)
assert "rows" in pmodel.coords
assert pmodel.coords["rows"] == ("R1", "R2", "R3", "R4", "R5")
assert "rows" in pmodel.dim_lengths
assert isinstance(pmodel.dim_lengths["rows"], ScalarSharedVariable)
assert pmodel.dim_lengths["rows"].eval() == 5
assert "columns" in pmodel.coords
assert pmodel.coords["columns"] == ("C1", "C2", "C3", "C4", "C5", "C6", "C7")
assert pmodel.RV_dims == {"observations": ("rows", "columns")}
assert "columns" in pmodel.dim_lengths
assert isinstance(pmodel.dim_lengths["columns"], ScalarSharedVariable)
assert pmodel.dim_lengths["columns"].eval() == 7

def test_set_coords_through_pmdata(self):
with pm.Model() as pmodel:
pm.ConstantData(
"population", [100, 200], dims="city", coords={"city": ["Tinyvil", "Minitown"]}
)
pm.MutableData(
"temperature",
[[15, 20, 22, 17], [18, 22, 21, 12]],
dims=("city", "season"),
coords={"season": ["winter", "spring", "summer", "fall"]},
)
assert "city" in pmodel.coords
assert "season" in pmodel.coords
assert pmodel.coords["city"] == ("Tinyvil", "Minitown")
assert pmodel.coords["season"] == ("winter", "spring", "summer", "fall")

def test_symbolic_coords(self):
"""
In v4 dimensions can be created without passing coordinate values.
Their lengths are then automatically linked to the corresponding Tensor dimension.
"""
with pm.Model() as pmodel:
# Dims created from MutableData are TensorVariables linked to the SharedVariable.shape
intensity = pm.MutableData("intensity", np.ones((2, 3)), dims=("row", "column"))
assert "row" in pmodel.dim_lengths
assert "column" in pmodel.dim_lengths
Expand Down
63 changes: 63 additions & 0 deletions pymc/tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
import scipy.sparse as sps
import scipy.stats as st

from aesara.tensor import TensorVariable
from aesara.tensor.random.op import RandomVariable
from aesara.tensor.sharedvar import ScalarSharedVariable
from aesara.tensor.var import TensorConstant

import pymc as pm
Expand Down Expand Up @@ -702,6 +704,67 @@ def test_nested_model_coords():
assert set(m2.RV_dims) < set(m1.RV_dims)


def test_shapeerror_from_resize_immutable_dims():
"""
Trying to resize an immutable dimension should raise a ShapeError.
Even if the variable being updated is a SharedVariable and has other
dimensions that are mutable.
"""
with pm.Model() as pmodel:
a = pm.Normal("a", mu=[1, 2, 3], dims="fixed")

m = pm.MutableData("m", [[1, 2, 3]], dims=("one", "fixed"))

# This is fine because the "fixed" dim is not resized
pm.set_data({"m": [[1, 2, 3], [3, 4, 5]]})

with pytest.raises(ShapeError, match="was initialized from 'a'"):
# Can't work because the "fixed" dimension is linked to a constant shape:
# Note that the new data tries to change both dimensions
with pmodel:
pm.set_data({"m": [[1, 2], [3, 4]]})


def test_valueerror_from_resize_without_coords_update():
"""
Resizing a mutable dimension that had coords,
without passing new coords raises a ValueError.
"""
with pm.Model() as pmodel:
pmodel.add_coord("shared", [1, 2, 3], mutable=True)
pm.MutableData("m", [1, 2, 3], dims=("shared"))
with pytest.raises(ValueError, match="'m' variable already had 3"):
# tries to resize m but without passing coords so raise ValueError
pm.set_data({"m": [1, 2, 3, 4]})


def test_coords_and_constantdata_create_immutable_dims():
"""
When created from `pm.Model(coords=...)` or `pm.ConstantData`
a dimension should be resizable.
"""
with pm.Model(coords={"group": ["A", "B"]}) as m:
x = pm.ConstantData("x", [0], dims="feature")
y = pm.Normal("y", x, 1, dims=("group", "feature"))
assert isinstance(m._dim_lengths["feature"], TensorConstant)
assert isinstance(m._dim_lengths["group"], TensorConstant)
assert x.eval().shape == (1,)
assert y.eval().shape == (2, 1)


def test_add_coord_mutable_kwarg():
"""
Checks resulting tensor type depending on mutable kwarg in add_coord.
"""
with pm.Model() as m:
m.add_coord("fixed", values=[1], mutable=False)
m.add_coord("mutable1", values=[1, 2], mutable=True)
assert isinstance(m._dim_lengths["fixed"], TensorConstant)
assert isinstance(m._dim_lengths["mutable1"], ScalarSharedVariable)
pm.MutableData("mdata", np.ones((1, 2, 3)), dims=("fixed", "mutable1", "mutable2"))
assert isinstance(m._dim_lengths["mutable2"], TensorVariable)


@pytest.mark.parametrize("jacobian", [True, False])
def test_model_logp(jacobian):
with pm.Model() as m:
Expand Down