
Commit c5ffa03

feat: Implement date_partition_column for SparkSource (feast-dev#4844)
1 parent b97da6c commit c5ffa03

File tree

3 files changed: +137 -1 lines changed

sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py

+10 -1

@@ -99,6 +99,8 @@ def pull_latest_from_table_or_query(
     fields_as_string = ", ".join(fields_with_aliases)
     aliases_as_string = ", ".join(aliases)
 
+    date_partition_column = data_source.date_partition_column
+
     start_date_str = _format_datetime(start_date)
     end_date_str = _format_datetime(end_date)
     query = f"""
@@ -109,7 +111,7 @@ def pull_latest_from_table_or_query(
             SELECT {fields_as_string},
             ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_
             FROM {from_expression} t1
-            WHERE {timestamp_field} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}')
+            WHERE {timestamp_field} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}'){" AND "+date_partition_column+" >= '"+start_date.strftime('%Y-%m-%d')+"' AND "+date_partition_column+" <= '"+end_date.strftime('%Y-%m-%d')+"' " if date_partition_column != "" and date_partition_column is not None else ''}
         ) t2
         WHERE feast_row_ = 1
         """
@@ -641,8 +643,15 @@ def _cast_data_frame(
         {% endfor %}
         FROM {{ featureview.table_subquery }}
         WHERE {{ featureview.timestamp_field }} <= '{{ featureview.max_event_timestamp }}'
+        {% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
+        AND {{ featureview.date_partition_column }} <= '{{ featureview.max_event_timestamp[:10] }}'
+        {% endif %}
+
         {% if featureview.ttl == 0 %}{% else %}
         AND {{ featureview.timestamp_field }} >= '{{ featureview.min_event_timestamp }}'
+        {% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
+        AND {{ featureview.date_partition_column }} >= '{{ featureview.min_event_timestamp[:10] }}'
+        {% endif %}
         {% endif %}
     ),
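Note on the hunk above: the partition predicate is built with an inline conditional expression inside the f-string, which is dense to read. The sketch below is an equivalent standalone rendering, for readability only; it is not part of the commit, and partition_filter is a hypothetical helper name.

from datetime import datetime


def partition_filter(date_partition_column, start_date, end_date):
    # Mirrors the inline conditional in pull_latest_from_table_or_query:
    # emit nothing when the partition column is unset or empty, otherwise
    # bound the partition column by the start/end dates (day granularity).
    if not date_partition_column:
        return ""
    start = start_date.strftime("%Y-%m-%d")
    end = end_date.strftime("%Y-%m-%d")
    return f" AND {date_partition_column} >= '{start}' AND {date_partition_column} <= '{end}' "


print(partition_filter("effective_date", datetime(2021, 1, 1), datetime(2021, 1, 2)))
# Output:  AND effective_date >= '2021-01-01' AND effective_date <= '2021-01-02'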

sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py

+6 -0

@@ -45,6 +45,7 @@ def __init__(
         tags: Optional[Dict[str, str]] = None,
         owner: Optional[str] = "",
         timestamp_field: Optional[str] = None,
+        date_partition_column: Optional[str] = None,
     ):
         """Creates a SparkSource object.
@@ -64,6 +65,8 @@ def __init__(
                 maintainer.
             timestamp_field: Event timestamp field used for point-in-time joins of
                 feature values.
+            date_partition_column: The column to partition the data on for faster
+                retrieval. This is useful for large tables and will limit the number of files read.
        """
        # If no name, use the table as the default name.
        if name is None and table is None:
@@ -77,6 +80,7 @@ def __init__(
             created_timestamp_column=created_timestamp_column,
             field_mapping=field_mapping,
             description=description,
+            date_partition_column=date_partition_column,
             tags=tags,
             owner=owner,
         )
@@ -135,6 +139,7 @@ def from_proto(data_source: DataSourceProto) -> Any:
             query=spark_options.query,
             path=spark_options.path,
             file_format=spark_options.file_format,
+            date_partition_column=data_source.date_partition_column,
             timestamp_field=data_source.timestamp_field,
             created_timestamp_column=data_source.created_timestamp_column,
             description=data_source.description,
@@ -148,6 +153,7 @@ def to_proto(self) -> DataSourceProto:
             type=DataSourceProto.BATCH_SPARK,
             data_source_class_type="feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
             field_mapping=self.field_mapping,
+            date_partition_column=self.date_partition_column,
             spark_options=self.spark_options.to_proto(),
             description=self.description,
             tags=self.tags,
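For context, a feature repository would opt into the new argument roughly as follows. This is a minimal sketch, not from the commit: the table and column names (feature_db.driver_hourly_stats, effective_date) are illustrative, and effective_date is assumed to be the table's physical partition column so the generated range predicate can prune partitions.

from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import (
    SparkSource,
)

driver_stats_source = SparkSource(
    name="driver_hourly_stats",
    table="feature_db.driver_hourly_stats",  # assumed metastore table
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
    date_partition_column="effective_date",  # new argument from this commit
)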

sdk/python/tests/unit/infra/offline_stores/contrib/spark_offline_store/test_spark.py

+121 -0

@@ -71,6 +71,68 @@ def test_pull_latest_from_table_with_nested_timestamp_or_query(mock_get_spark_se
     assert retrieval_job.query.strip() == expected_query.strip()
 
 
+@patch(
+    "feast.infra.offline_stores.contrib.spark_offline_store.spark.get_spark_session_or_start_new_with_repoconfig"
+)
+def test_pull_latest_from_table_with_nested_timestamp_or_query_and_date_partition_column_set(
+    mock_get_spark_session,
+):
+    mock_spark_session = MagicMock()
+    mock_get_spark_session.return_value = mock_spark_session
+
+    test_repo_config = RepoConfig(
+        project="test_project",
+        registry="test_registry",
+        provider="local",
+        offline_store=SparkOfflineStoreConfig(type="spark"),
+    )
+
+    test_data_source = SparkSource(
+        name="test_nested_batch_source",
+        description="test_nested_batch_source",
+        table="offline_store_database_name.offline_store_table_name",
+        timestamp_field="nested_timestamp",
+        field_mapping={
+            "event_header.event_published_datetime_utc": "nested_timestamp",
+        },
+        date_partition_column="effective_date",
+    )
+
+    # Define the parameters for the method
+    join_key_columns = ["key1", "key2"]
+    feature_name_columns = ["feature1", "feature2"]
+    timestamp_field = "event_header.event_published_datetime_utc"
+    created_timestamp_column = "created_timestamp"
+    start_date = datetime(2021, 1, 1)
+    end_date = datetime(2021, 1, 2)
+
+    # Call the method
+    retrieval_job = SparkOfflineStore.pull_latest_from_table_or_query(
+        config=test_repo_config,
+        data_source=test_data_source,
+        join_key_columns=join_key_columns,
+        feature_name_columns=feature_name_columns,
+        timestamp_field=timestamp_field,
+        created_timestamp_column=created_timestamp_column,
+        start_date=start_date,
+        end_date=end_date,
+    )
+
+    expected_query = """SELECT
+                key1, key2, feature1, feature2, nested_timestamp, created_timestamp
+
+            FROM (
+                SELECT key1, key2, feature1, feature2, event_header.event_published_datetime_utc AS nested_timestamp, created_timestamp,
+                ROW_NUMBER() OVER(PARTITION BY key1, key2 ORDER BY event_header.event_published_datetime_utc DESC, created_timestamp DESC) AS feast_row_
+                FROM `offline_store_database_name`.`offline_store_table_name` t1
+                WHERE event_header.event_published_datetime_utc BETWEEN TIMESTAMP('2021-01-01 00:00:00.000000') AND TIMESTAMP('2021-01-02 00:00:00.000000') AND effective_date >= '2021-01-01' AND effective_date <= '2021-01-02'
+            ) t2
+            WHERE feast_row_ = 1"""  # noqa: W293, W291
+
+    assert isinstance(retrieval_job, RetrievalJob)
+    assert retrieval_job.query.strip() == expected_query.strip()
+
+
 @patch(
     "feast.infra.offline_stores.contrib.spark_offline_store.spark.get_spark_session_or_start_new_with_repoconfig"
 )
@@ -127,3 +189,62 @@ def test_pull_latest_from_table_without_nested_timestamp_or_query(
 
     assert isinstance(retrieval_job, RetrievalJob)
     assert retrieval_job.query.strip() == expected_query.strip()
+
+
+@patch(
+    "feast.infra.offline_stores.contrib.spark_offline_store.spark.get_spark_session_or_start_new_with_repoconfig"
+)
+def test_pull_latest_from_table_without_nested_timestamp_or_query_and_date_partition_column_set(
+    mock_get_spark_session,
+):
+    mock_spark_session = MagicMock()
+    mock_get_spark_session.return_value = mock_spark_session
+
+    test_repo_config = RepoConfig(
+        project="test_project",
+        registry="test_registry",
+        provider="local",
+        offline_store=SparkOfflineStoreConfig(type="spark"),
+    )
+
+    test_data_source = SparkSource(
+        name="test_batch_source",
+        description="test_nested_batch_source",
+        table="offline_store_database_name.offline_store_table_name",
+        timestamp_field="event_published_datetime_utc",
+        date_partition_column="effective_date",
+    )
+
+    # Define the parameters for the method
+    join_key_columns = ["key1", "key2"]
+    feature_name_columns = ["feature1", "feature2"]
+    timestamp_field = "event_published_datetime_utc"
+    created_timestamp_column = "created_timestamp"
+    start_date = datetime(2021, 1, 1)
+    end_date = datetime(2021, 1, 2)
+
+    # Call the method
+    retrieval_job = SparkOfflineStore.pull_latest_from_table_or_query(
+        config=test_repo_config,
+        data_source=test_data_source,
+        join_key_columns=join_key_columns,
+        feature_name_columns=feature_name_columns,
+        timestamp_field=timestamp_field,
+        created_timestamp_column=created_timestamp_column,
+        start_date=start_date,
+        end_date=end_date,
+    )
+
+    expected_query = """SELECT
+                key1, key2, feature1, feature2, event_published_datetime_utc, created_timestamp
+
+            FROM (
+                SELECT key1, key2, feature1, feature2, event_published_datetime_utc, created_timestamp,
+                ROW_NUMBER() OVER(PARTITION BY key1, key2 ORDER BY event_published_datetime_utc DESC, created_timestamp DESC) AS feast_row_
+                FROM `offline_store_database_name`.`offline_store_table_name` t1
+                WHERE event_published_datetime_utc BETWEEN TIMESTAMP('2021-01-01 00:00:00.000000') AND TIMESTAMP('2021-01-02 00:00:00.000000') AND effective_date >= '2021-01-01' AND effective_date <= '2021-01-02'
+            ) t2
+            WHERE feast_row_ = 1"""  # noqa: W293, W291
+
+    assert isinstance(retrieval_job, RetrievalJob)
+    assert retrieval_job.query.strip() == expected_query.strip()
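Both new tests mock the Spark session, so they should run without a cluster. Assuming a checkout with the contrib test dependencies installed, they can be run in isolation with pytest's -k filter:

pytest sdk/python/tests/unit/infra/offline_stores/contrib/spark_offline_store/test_spark.py -k date_partition_column_set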
