Commit 8472046: training pipeline
committed Jun 7, 2024 · 1 parent f43f62a
7 files changed: +4008 −1 lines changed

.gitignore (+5)
```
data/
runs/

*.pkl
```

EDA/eda.py (+386)
```python
# %%
import random

import cartopy.crs as crs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# %%
df_train = pd.read_parquet("../data/trainset_new.parquet")
df_test = pd.read_parquet("../data/testset_forecast.parquet")

df_loc_train = df_train[["latitude_rounded", "longitude_rounded"]].drop_duplicates()
print(
    f"train latitude range: {df_loc_train['latitude_rounded'].min()},"
    f" {df_loc_train['latitude_rounded'].max()}"
)
print(
    f"train longitude range: {df_loc_train['longitude_rounded'].min()},"
    f" {df_loc_train['longitude_rounded'].max()}"
)

df_loc_test = df_test[["latitude_rounded", "longitude_rounded"]].drop_duplicates()
print(
    f"test latitude range: {df_loc_test['latitude_rounded'].min()},"
    f" {df_loc_test['latitude_rounded'].max()}"
)
print(
    f"test longitude range: {df_loc_test['longitude_rounded'].min()},"
    f" {df_loc_test['longitude_rounded'].max()}"
)

start_date = df_train["date"].min()
end_date = df_train["date"].max()
# %%
df_train.columns
# %%
# plot a random station
train_ids = df_train["ss_id"].unique()
test_ids = df_test["ss_id"].unique()
idx = random.choice(train_ids)
df_train[df_train["ss_id"] == idx][["average_power_kw", "date"]].plot(
    x="date", y="average_power_kw", title=f"{idx}"
)
# %%
# from visual inspection, exclude the following panels:
# 17693: inconsistency in the time series
# 6611, 7750, 6927, 26865: only 0 entries
outliers_train = []
outliers_test = []
save = False
outliers_train.extend([17693, 6611, 7750, 6927, 26865])


# %%
class CleanOutliers:
    """Calculate statistics for the dataset and identify outliers to be removed."""

    def __init__(self, train: bool, outliers: list):
        """Initialize arguments.

        Args:
            train (bool): if True, conditions for the training set are used,
                else conditions for the test set are used
            outliers (list): list of outlier ids to start with
        """
        self.train = train
        self.outliers = outliers

    def apply_filter(self, df: pd.DataFrame, filter: dict) -> pd.DataFrame:
        """Apply the given filter to the dataframe, keeping only the data defined in the filter.

        Args:
            df (pd.DataFrame): complete dataframe with all panels
            filter (dict): column/value pairs to keep

        Returns:
            pd.DataFrame: dataframe after applying the filter
        """
        print(f"data length before filtering: {len(df)}")

        for key, value in filter.items():
            df = df[df[key] == value]
        self.ids = df["ss_id"].unique()
        self.df_station = pd.DataFrame(data={"ss_id": self.ids})
        print(f"remaining data length after filtering: {len(df)}")
        return df

    def get_night_energy(self, df: pd.DataFrame, criterion: dict) -> pd.DataFrame:
        """Get different statistics for the energy produced at night.

        Args:
            df (pd.DataFrame): dataframe containing the data
            criterion (dict): criterion to mark outliers

        Returns:
            pd.DataFrame: dataframe with statistics for each panel
            list: ids of panels exceeding the criterion
        """
        df_night_sum = df.groupby("ss_id").apply(lambda x: self._night_sum(x)).reset_index()
        df_night_sum.rename(columns={0: "total_power_at_night_kw"}, inplace=True)
        df_night_mean = df.groupby("ss_id").apply(lambda x: self._night_mean(x)).reset_index()
        df_night_mean.rename(columns={0: "mean_power_at_night_kw"}, inplace=True)
        df_night_median = df.groupby("ss_id").apply(lambda x: self._night_median(x)).reset_index()
        df_night_median.rename(columns={0: "median_power_at_night_kw"}, inplace=True)
        self.df_station = df_night_sum.merge(df_night_mean, on="ss_id")
        self.df_station = self.df_station.merge(df_night_median, on="ss_id")

        idx_outliers = []
        for key, value in criterion.items():
            df_out = self.df_station[self.df_station[key] >= value]
            idx_outliers.extend(list(df_out["ss_id"].values))
        self.outliers.extend(idx_outliers)

        return self.df_station, idx_outliers

    def get_statistics(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate summary statistics for the individual panels.

        Args:
            df (pd.DataFrame): complete dataframe with all panels

        Returns:
            pd.DataFrame: dataframe with statistics for each panel in ids
        """
        means = df.groupby("ss_id")["average_power_kw"].mean()
        # seasonal means per panel
        means_winter = (
            df[df["date"].dt.month.isin([12, 1, 2])].groupby("ss_id")["average_power_kw"].mean()
        )
        means_spring = (
            df[df["date"].dt.month.isin([3, 4, 5])].groupby("ss_id")["average_power_kw"].mean()
        )
        means_summer = (
            df[df["date"].dt.month.isin([6, 7, 8])].groupby("ss_id")["average_power_kw"].mean()
        )
        means_autumn = (
            df[df["date"].dt.month.isin([9, 10, 11])].groupby("ss_id")["average_power_kw"].mean()
        )
        medians = df.groupby("ss_id")["average_power_kw"].median()
        stds = df.groupby("ss_id")["average_power_kw"].std()
        lats = df.groupby("ss_id")["latitude_rounded"].mean()
        lons = df.groupby("ss_id")["longitude_rounded"].mean()

        s_date = df.groupby("ss_id")["date"].min()
        e_date = df.groupby("ss_id")["date"].max()

        data = {
            "mean": means,
            "median": medians,
            "std": stds,
            "lat": lats,
            "lon": lons,
            "start_date": s_date,
            "end_date": e_date,
            "mean_winter": means_winter,
            "mean_spring": means_spring,
            "mean_summer": means_summer,
            "mean_autumn": means_autumn,
        }
        df_statistics = pd.DataFrame(data=data)
        self.df_station = df_statistics.merge(self.df_station, on="ss_id")
        return self.df_station

    def get_data_availability(self, df: pd.DataFrame) -> pd.DataFrame:
        """Get information about the data availability.

        Args:
            df (pd.DataFrame): complete dataframe with all panels

        Returns:
            pd.DataFrame: dataframe with data availability info for each panel in ids
        """
        data = {
            "ss_id": [],
            "total_length": [],
            "data_length": [],
            "time_span": [],
            "missing_values_total": [],
            "missing_values_span": [],
        }
        for id_ in self.ids:
            df_tmp = df[df["ss_id"] == id_]
            if df_tmp.empty:
                self.outliers.extend([id_])
            else:
                # hourly range spanned by the available data of this panel
                s_date = df_tmp["date"].min()
                e_date = df_tmp["date"].max()
                df_span = pd.DataFrame(
                    pd.date_range(start=s_date, end=e_date, freq="h"), columns=["date"]
                )
                # hourly range of the entire time frame
                df_time = pd.DataFrame(
                    pd.date_range(start="2018-01-01", end="2021-11-09", freq="h"), columns=["date"]
                )
                # total number of samples available
                da = len(df_time[df_time.date.isin(df_tmp.date)])
                data["ss_id"].append(id_)
                data["total_length"].append(len(df_time))
                data["data_length"].append(da)
                data["time_span"].append(len(df_span))
                data["missing_values_total"].append(len(df_time) - da)
                data["missing_values_span"].append(len(df_span) - da)
        df_data = pd.DataFrame(data=data)
        self.df_station = df_data.merge(self.df_station, on="ss_id")
        return self.df_station

    def remove_zero_sequences(self, df: pd.DataFrame, criterion: dict) -> pd.DataFrame:
        """Remove long sequences of zeros in the target data.

        Args:
            df (pd.DataFrame): complete dataframe with all panels
            criterion (dict): defines "zero_sequence_length", the minimum run length to drop

        Returns:
            pd.DataFrame: complete dataframe with all panels,
                with samples removed that are part of a long sequence of zeros
        """
        n = criterion["zero_sequence_length"]
        is_zero = df[target] == 0
        # use cumsum to create groups of consecutive zeros
        zero_groups = is_zero.ne(is_zero.shift()).cumsum()

        # keep only the zero runs shorter than n
        filtered_groups = df[is_zero].groupby(zero_groups).filter(lambda x: len(x) < n)
        valid_indices = filtered_groups.index

        # combine indices of non-zero rows with valid zero-sequence rows
        non_zero_indices = df[~is_zero].index
        all_valid_indices = non_zero_indices.union(valid_indices).sort_values()
        # return the dataframe with only the valid indices
        return df.loc[all_valid_indices]

    def remove_outliers(self, df_statistics: pd.DataFrame, criterion: dict) -> pd.DataFrame:
        """Remove outliers from the training data depending on "criterion".

        Args:
            df_statistics (pd.DataFrame): dataframe with statistics for each panel in ids
            criterion (dict): criterion to remove outliers

        Returns:
            pd.DataFrame: dataframe with statistics for each panel in ids, outliers removed
            list: list of outlier ids
        """
        if self.train:
            # remove time series that have little data
            outlier_idx = df_statistics[df_statistics["time_span"] < criterion["min_time_span"]][
                "ss_id"
            ].values
            self.outliers.extend(outlier_idx)
            # remove panels with more than p% missing values of the available time span
            p = criterion["percentage_missing"]
            outlier_idx = df_statistics[
                df_statistics["missing_values_span"] / df_statistics["time_span"] > p
            ]["ss_id"].values
            self.outliers.extend(outlier_idx)
            # remove panels with median above the p-th percentile
            p = np.percentile(df_statistics["median"], q=[criterion["percentile"]])[0]
            outlier_idx = df_statistics[df_statistics["median"] > p]["ss_id"].values
            self.outliers.extend(outlier_idx)
            self.outliers = list(set(self.outliers))

        df_statistics = df_statistics[~df_statistics["ss_id"].isin(self.outliers)]
        return df_statistics, self.outliers

    def __call__(
        self, df: pd.DataFrame, night_criterion: dict, statistics_criterion: dict
    ) -> pd.DataFrame:
        """Remove outliers from the dataframe and return statistics of the panels.

        Args:
            df (pd.DataFrame): complete dataframe with all panels
            night_criterion (dict): criterion on the energy produced at night
            statistics_criterion (dict): criteria on the per-panel statistics

        Returns:
            pd.DataFrame: dataframe with statistics for each panel for all ids
            pd.DataFrame: dataframe with statistics for each panel in ids, outliers removed
            list: list of outlier ids
        """
        clean_outliers = CleanOutliers(train=self.train, outliers=self.outliers)
        # note: "filter" is the module-level filter dict defined below
        df = clean_outliers.apply_filter(df, filter)
        df = clean_outliers.remove_zero_sequences(df, statistics_criterion)
        df_statistics, _ = clean_outliers.get_night_energy(df, night_criterion)
        df_statistics = clean_outliers.get_statistics(df)
        df_statistics = clean_outliers.get_data_availability(df)
        df_statistics_clean, outliers = clean_outliers.remove_outliers(
            df_statistics, statistics_criterion
        )
        return df_statistics, df_statistics_clean, outliers

    def _night_sum(self, df_loc):
        """Get the total sum of energy at night.

        Returns:
            float: total power produced at night
        """
        night_sum = df_loc[df_loc["is_day"] == 0][target].sum()
        return night_sum

    def _night_mean(self, df_loc):
        """Get the mean of energy at night.

        Returns:
            float: mean power produced at night
        """
        night_mean = df_loc[df_loc["is_day"] == 0][target].mean()
        return night_mean

    def _night_median(self, df_loc):
        """Get the median of energy at night.

        Returns:
            float: median power produced at night
        """
        night_median = df_loc[df_loc["is_day"] == 0][target].median()
        return night_median


# %%
target = "average_power_kw"
# dictionaries to filter data
filter = {"is_day": 1}
night_criterion = {"total_power_at_night_kw": 100}
statistics_criterion = {
    "min_time_span": 720,
    "percentage_missing": 0.9,
    "percentile": 90,
    "zero_sequence_length": 30,
}

clean_outliers = CleanOutliers(train=True, outliers=outliers_train)
df_statistics_train, df_statistics_train_clean, outliers_train = clean_outliers(
    df_train, night_criterion, statistics_criterion
)
print(len(outliers_train))

clean_outliers = CleanOutliers(train=False, outliers=outliers_test)
df_statistics_test, df_statistics_test_clean, outliers_test = clean_outliers(
    df_test, night_criterion, statistics_criterion
)
print(len(outliers_test))
# %%
# boxplots
fig, ax = plt.subplots(1, 3, figsize=(10, 5))
df_statistics_train.boxplot(column=["mean", "median", "std", "mean_summer"], ax=ax[0])
ax[0].set_title("train")
ax[0].set_ylim(0, 1)
df_statistics_train_clean.boxplot(column=["mean", "median", "std", "mean_summer"], ax=ax[1])
ax[1].set_title("train clean")
ax[1].set_ylim(0, 1)
ax[2] = df_statistics_test.boxplot(column=["mean", "median", "std", "mean_summer"], ax=ax[2])
ax[2].set_title("test")
ax[2].set_ylim(0, 1)
if save:
    plt.savefig("boxplot_only_day.png")
# %%
# spatial plots of statistical variables
var = "median"
df1 = df_statistics_train_clean
df2 = df_statistics_test
x1 = df1.lon
y1 = df1.lat
data1 = df1[var]
title1 = f"train - {var} ({len(df1)})"
x2 = df2.lon
y2 = df2.lat
data2 = df2[var]
title2 = f"test - {var} ({len(df2)})"
vmin = 0
vmax = np.max([data1.max(), data2.max()])

fig = plt.figure(figsize=(15, 6))
cm = plt.colormaps["RdYlBu"]
ax = fig.add_subplot(1, 2, 1, projection=crs.PlateCarree())
ax.coastlines()
gl = ax.gridlines(draw_labels=True)
gl.top_labels = False
gl.left_labels = False
# map extent covering the UK
lat1, lon1, lat2, lon2 = 50.0, -5.7, 59.0, 1.8
ax.set_extent([lon1, lon2, lat1, lat2], crs=crs.PlateCarree())
sc = plt.scatter(
    x=x1, y=y1, c=data1, vmin=vmin, vmax=vmax, cmap=cm, s=2, alpha=0.8, transform=crs.PlateCarree()
)
plt.colorbar(sc)
plt.title(title1)
ax = fig.add_subplot(1, 2, 2, projection=crs.PlateCarree())
ax.coastlines()
gl = ax.gridlines(draw_labels=True)
gl.top_labels = False
gl.left_labels = False
lat1, lon1, lat2, lon2 = 50.0, -5.7, 59.0, 1.8
ax.set_extent([lon1, lon2, lat1, lat2], crs=crs.PlateCarree())
sc = plt.scatter(
    x=x2, y=y2, c=data2, vmin=vmin, vmax=vmax, cmap=cm, s=2, alpha=0.8, transform=crs.PlateCarree()
)
plt.colorbar(sc)
plt.title(title2)
if save:
    plt.savefig(f"{var}_only_day.png")

# %%
```
README.md (+64 −1)

# solar-training-prediction
This repository contains the tools needed to train a model for solar energy prediction. This work was a collaboration with [Open Climate Fix (OCF)](https://github.com/openclimatefix/Open-Source-Quartz-Solar-Forecast/tree/main).

The model was trained using the [ml-garden](https://github.com/tryolabs/ml-garden) library, which was developed during this project.
# Data

The training data was downloaded from [open-meteo](https://open-meteo.com/). More specifically, hourly forecast data from the [historical weather API](https://open-meteo.com/en/docs/historical-weather-api) was used. The time period is restricted by the availability of the target solar energy data of the panels and covers the years 2018 to 2021. Additional information about the time, the location, and the specifics of each panel is used. The weather features used are listed below, with the descriptions given by open-meteo.

- Temperature at 2m (ºC): Air temperature at 2 meters above ground
- Relative Humidity at 2m (%): Relative humidity at 2 meters above ground
- Dewpoint at 2m (ºC): Dew point temperature at 2 meters above ground
- Precipitation (rain + snow) (mm): Total precipitation (rain, showers, snow) sum of the preceding hour
- Surface Pressure (hPa): Atmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology.
- Cloud Cover Total (%): Total cloud cover as an area fraction
- Cloud Cover Low (%): Low level clouds and fog up to 3 km altitude
- Cloud Cover Mid (%): Mid level clouds from 3 to 8 km altitude
- Cloud Cover High (%): High level clouds from 8 km altitude
- Wind Speed at 10m (km/h): Wind speed at 10 meters above ground, the standard measurement level
- Wind Direction (10m): Wind direction at 10 meters above ground
- Is Day or Night: 1 if the current time step has daylight, 0 at night
- Direct Solar Radiation (W/m2): Direct solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)
- Diffuse Solar Radiation DHI (W/m2): Diffuse solar radiation as average of the preceding hour

The data was downloaded and transformed into a dataframe. The [training](https://drive.google.com/file/d/16b35aP2ML96-8B8CZ1KMjJxrUvAyS6LV/view?usp=sharing) and the [test](https://drive.google.com/file/d/1hYCsWnVWMsKujR-qBIjLlvW2rbPHeftE/view?usp=sharing) datasets are stored as `.parquet` files.
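A quick way to sanity-check the downloaded files is to load them with pandas and inspect the columns used throughout this repository (`ss_id`, `date`, `latitude_rounded`, `longitude_rounded`, and the target `average_power_kw`). A minimal sketch, assuming the files were saved into `data/`:

```python
import pandas as pd

# load the training set and inspect its structure; the path assumes the
# parquet file from the link above was placed in data/
df_train = pd.read_parquet("data/trainset_new.parquet")
print(df_train.columns.tolist())
print(df_train["date"].min(), df_train["date"].max())  # should span 2018 to 2021
print(df_train["average_power_kw"].describe())         # target variable
```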
# Installation

To train the model, the ml-garden library needs to be installed using `poetry`. For instructions on how to install `poetry`, please refer to the [poetry documentation](https://python-poetry.org/docs/#installing-with-pipx). Additionally, we need to install `XGBoost`, which is the model used for training.

Use the following commands to install the needed packages and set up the environment.

```
poetry init

poetry add git+ssh://git@github.com:tryolabs/ml-garden.git

poetry add xgboost

poetry shell
```
# Preprocessing

The panel data was analyzed to detect outliers based on statistical criteria.

The exact preprocessing steps can be found in [EDA/eda.py](EDA/eda.py).

Note that no data is removed from the dataframe in this step; the analysis only identifies the data that should be removed. The actual removal is defined in `config.json`, as sketched below.
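For illustration, the outlier ids collected by the EDA script could be written into the `drop_ids` parameter of the `CleanStep` in `config.json`. This is a minimal sketch, not part of the pipeline; it assumes `outliers_train` is the list produced by `EDA/eda.py`:

```python
import json

# load the pipeline configuration shipped with this repository
with open("config.json") as f:
    config = json.load(f)

# write the outlier ids into the CleanStep; shortened example list taken
# from the panels excluded by visual inspection in EDA/eda.py
outliers_train = [17693, 6611, 7750, 6927, 26865]
for step in config["pipeline"]["steps"]:
    if step["step_type"] == "CleanStep":
        step["parameters"]["drop_ids"]["ss_id"] = sorted(set(outliers_train))

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)
```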
# Train Model

The configuration file `config.json` provides the training parameters for the final model that is deployed. For more details on how to set up a configuration file, please refer to the [documentation of ml-garden](https://github.com/tryolabs/ml-garden/blob/main/documentation/user_guide.md). To train the model with the provided `config.json`, run the script

```
python3 run_training.py
```
# Results

The pipeline execution returns a DataContainer object, called `data` in our script. This object contains the raw input data as a pandas dataframe, which can be accessed using `data.raw`, as well as the results of all steps performed in the pipeline during training. The prediction results can be accessed via `data.flow`.

The results are stored in the folder `runs`, which is created during execution if it doesn't exist. The configuration file and a file containing the evaluation metrics are stored in this folder.
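For example, the returned container can be inspected directly after a run; a minimal sketch using only the `data.raw` and `data.flow` attributes described above:

```python
from ml_garden import Pipeline

# run the training pipeline and inspect the returned DataContainer
data = Pipeline.from_json("config.json").run(is_train=True)
print(data.raw.head())  # raw input data as a pandas dataframe
print(data.flow)        # data after the pipeline steps, including the predictions
```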
config.json (+264)

```json
{
  "pipeline": {
    "name": "XGBoostTrainingPipeline",
    "description": "Training pipeline for XGBoost models.",
    "parameters": {
      "save_data_path": "ocf_pipeline.pkl",
      "target": "average_power_kw",
      "columns_to_ignore_for_training": ["ss_id", "date"]
    },
    "steps": [
      {
        "step_type": "GenerateStep",
        "parameters": {
          "train_path": "data/trainset_new.parquet",
          "test_path": "data/testset_forecast.parquet",
          "predict_path": "data/testset_forecast.parquet",
          "drop_columns": [
            "operational_at",
            "total_energy_kwh",
            "terrestrial_radiation",
            "shortwave_radiation",
            "direct_normal_irradiance"
          ]
        }
      },
      {
        "step_type": "TabularSplitStep",
        "parameters": {
          "train_percentage": 0.95,
          "group_by_columns": ["ss_id"]
        }
      },
      {
        "step_type": "CleanStep",
        "parameters": {
          "filter": { "is_day": "is_day != 0" },
          "drop_na_columns": ["average_power_kw", "diffuse_radiation"],
          "drop_ids": {
            "ss_id": [
              6656, 3074, 6663, 6667, 7184, 3093, 6682, 10791, 5177, 6721,
              7750, 6732, 3149, 13388, 6748, 2657, 13415, 7276, 7292, 7295,
              3208, 6800, 7312, 5780, 26777, 26778, 7835, 26782, 26788, 7846,
              6826, 26795, 26800, 3250, 26803, 26805, 26807, 7865, 6843, 6846,
              6336, 26819, 3270, 26822, 26825, 6865, 2770, 7383, 3288, 6872,
              6877, 26845, 6880, 6881, 26849, 3811, 6372, 26853, 26856, 26858,
              26859, 6380, 26860, 3311, 6896, 26865, 6898, 26867, 6902, 3323,
              3324, 3326, 26887, 6409, 6927, 2832, 2834, 26899, 10004, 26902,
              6424, 3865, 7448, 17693, 10531, 6949, 6952, 6953, 7469, 6446,
              7471, 26925, 26926, 26936, 26939, 26941, 4421, 11590, 4422, 6981,
              26951, 6994, 2902, 5974, 6490, 6491, 6493, 26974, 26976, 26978,
              12644, 6503, 26985, 26989, 3951, 3952, 7025, 26991, 26998, 27000,
              27002, 10619, 2940, 27003, 27006, 6527, 7551, 6021, 27016, 7050,
              27019, 3472, 3476, 7060, 7062, 27038, 27046, 3496, 27048, 7595,
              27051, 27055, 9648, 6577, 7088, 7090, 27059, 4033, 8648, 6602,
              7114, 7119, 3536, 3026, 6611, 7124, 6614, 6621, 11752, 6126,
              7159, 4090, 4092
            ]
          }
        }
      },
      {
        "step_type": "CalculateFeaturesStep",
        "parameters": {
          "datetime_columns": ["date"],
          "features": ["month", "day", "hour"]
        }
      },
      {
        "step_type": "EncodeStep",
        "parameters": {}
      },
      {
        "step_type": "ModelStep",
        "parameters": {
          "model_class": "XGBoost",
          "model_parameters": {
            "max_depth": 15,
            "eta": 0.03932150362959542,
            "objective": "reg:squarederror",
            "eval_metric": "mae",
            "n_jobs": -1,
            "n_estimators": 2288,
            "min_child_weight": 1,
            "subsample": 0.8885899409499547,
            "colsample_bytree": 0.8439451149438386,
            "early_stopping_rounds": 20,
            "tree_method": "hist"
          }
        }
      },
      {
        "step_type": "CalculateMetricsStep"
      },
      {
        "step_type": "ExplainerDashboardStep",
        "parameters": { "enable_step": false }
      }
    ]
  }
}
```
poetry.lock (+3,266)

Generated dependency lock file; contents not rendered.
pyproject.toml (+16)

```toml
[tool.poetry]
name = "solar-training-prediction"
version = "0.1.0"
description = ""
authors = ["froukje <falbrechtg@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
xgboost = "^2.0.3"
ml-garden = { git = "git@github.com:tryolabs/ml-garden.git" }

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```

run_training.py (+7)
```python
import logging

from ml_garden import Pipeline

# log pipeline progress at debug level
logging.basicConfig(level=logging.DEBUG)

# build the pipeline from the configuration file and run it in training mode
data = Pipeline.from_json("config.json").run(is_train=True)
```