|
| 1 | +# %% |
| 2 | +import random |
| 3 | + |
| 4 | +import cartopy.crs as crs |
| 5 | +import matplotlib.pyplot as plt |
| 6 | +import numpy as np |
| 7 | +import pandas as pd |
| 8 | + |
# %%
# Load the training and test sets and report the coordinate range of each.
df_train = pd.read_parquet("../data/trainset_new.parquet")
df_test = pd.read_parquet("../data/testset_forecast.parquet")

# One row per distinct (rounded) station location in each split.
_location_columns = ["latitude_rounded", "longitude_rounded"]
df_loc_train = df_train[_location_columns].drop_duplicates()
df_loc_test = df_test[_location_columns].drop_duplicates()

for _split, _locations in (("train", df_loc_train), ("test", df_loc_test)):
    for _coordinate in ("latitude", "longitude"):
        _values = _locations[f"{_coordinate}_rounded"]
        print(f"{_split} {_coordinate} range: {_values.min()}, {_values.max()}")

# Overall time window covered by the training data.
start_date, end_date = df_train["date"].min(), df_train["date"].max()
# %%
df_train.columns
# %%
# Plot the power time series of one randomly chosen training station.
train_ids = df_train["ss_id"].unique()
test_ids = df_test["ss_id"].unique()
idx = random.choice(train_ids)
_station_rows = df_train.loc[df_train["ss_id"] == idx, ["average_power_kw", "date"]]
_station_rows.plot(x="date", y="average_power_kw", title=f"{idx}")
# %%
# Panels excluded after visual inspection of their time series:
#   17693: inconsistency in time series
#   6611, 7750, 6927, 26865: only 0 entries
outliers_train = [17693, 6611, 7750, 6927, 26865]
outliers_test = []
# Set to True to write the figures produced further down to disk.
save = False
| 53 | + |
| 54 | + |
| 55 | +# %% |
class CleanOutliers:
    """Compute per-panel statistics and collect the ids of panels to discard as outliers.

    Calling the instance runs the whole pipeline (see ``__call__``). The individual
    steps can also be used on their own, but ``get_statistics`` and
    ``get_data_availability`` merge into ``self.df_station`` and therefore require
    that ``apply_filter`` or ``get_night_energy`` has been called before.

    Attributes:
        train: whether the training-set criteria of ``remove_outliers`` are applied
        outliers: ids of all panels flagged so far
        target: name of the power column the statistics are computed on
        ids: panel ids present after ``apply_filter`` (set by ``apply_filter``)
        df_station: one row of statistics per panel (set/extended by the steps)
    """

    # Defaults used when neither an explicit argument nor a notebook-level
    # variable of the same meaning is available.
    DEFAULT_TARGET = "average_power_kw"
    DEFAULT_FILTER = {"is_day": 1}
    # Reference window against which "total" data availability is measured.
    AVAILABILITY_START = "2018-01-01"
    AVAILABILITY_END = "2021-11-09"

    def __init__(self, train: bool, outliers: list, target: str = None):
        """Initialize arguments.

        Args:
            train (bool): if True conditions for training set are used,
                else conditions for test set are used
            outliers (list): list of outliers to start with; it is copied, the
                caller's list is never modified
            target (str, optional): name of the power column. When omitted, a
                module-level ``target`` string is used if one exists (this is how the
                class was configured originally), otherwise ``DEFAULT_TARGET``.
        """
        self.train = train
        # Copy so that extending the outlier list does not silently change the
        # list object owned by the caller.
        self.outliers = list(outliers)
        if target is None:
            legacy_target = globals().get("target")
            target = legacy_target if isinstance(legacy_target, str) else self.DEFAULT_TARGET
        self.target = target

    def apply_filter(self, df: pd.DataFrame, filter: dict) -> pd.DataFrame:
        """Keep only the rows whose columns equal the values given in ``filter``.

        Also records the remaining panel ids in ``self.ids`` and resets
        ``self.df_station`` to a frame holding just those ids.

        Args:
            df (pd.DataFrame): complete dataframe with all panels
            filter (dict): mapping ``column name -> required value``

        Returns:
            pd.DataFrame: dataframe after applying the filter
        """
        print(f"data length before filtering: {len(df)}")

        for key, value in filter.items():
            df = df[df[key] == value]
        self.ids = df["ss_id"].unique()
        self.df_station = pd.DataFrame(data={"ss_id": self.ids})
        print(f"remaining data length after filtering: {len(df)}")
        return df

    def get_night_energy(self, df: pd.DataFrame, criterion: dict) -> tuple:
        """Compute sum, mean and median of the target at night for every panel.

        Rows with ``is_day == 0`` count as night. Panels without any night row get
        a sum of 0 and NaN for mean and median. ``self.df_station`` is replaced by
        the resulting frame.

        Args:
            df (pd.DataFrame): dataframe containing the data
            criterion (dict): mapping ``statistic column -> threshold``; a panel whose
                statistic is greater than or equal to the threshold is an outlier

        Returns:
            pd.DataFrame: dataframe with the night statistics for each panel
            list: ids of the panels flagged by ``criterion`` (also appended to
                ``self.outliers``)
        """
        # Blank out day-time values so one grouped aggregation yields all three
        # statistics while still producing a row for every panel.
        night_power = df[self.target].where(df["is_day"] == 0)
        night_stats = night_power.groupby(df["ss_id"]).agg(["sum", "mean", "median"])
        night_stats.columns = [
            "total_power_at_night_kw",
            "mean_power_at_night_kw",
            "median_power_at_night_kw",
        ]
        self.df_station = night_stats.rename_axis("ss_id").reset_index()

        idx_outliers = []
        for key, value in criterion.items():
            flagged = self.df_station.loc[self.df_station[key] >= value, "ss_id"]
            idx_outliers.extend(flagged.tolist())
        self.outliers.extend(idx_outliers)

        return self.df_station, idx_outliers

    def get_statistics(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate some statistics for individual panels.

        Mean, median and standard deviation of the target, the seasonal means
        (winter = Dec-Feb, spring = Mar-May, summer = Jun-Aug, autumn = Sep-Nov),
        mean coordinates and first/last date are all computed per panel and merged
        into ``self.df_station``. A panel without data in a season gets NaN there.

        Args:
            df (pd.DataFrame): complete dataframe with all panels

        Returns:
            pd.DataFrame: dataframe with statistics for each panel in ids
        """
        per_panel = df.groupby("ss_id")
        power = per_panel[self.target]
        month = df["date"].dt.month

        def seasonal_mean(months):
            # Group by panel as well: a plain mean over the season would yield a
            # single number that is identical for every panel.
            return df[month.isin(months)].groupby("ss_id")[self.target].mean()

        data = {
            "mean": power.mean(),
            "median": power.median(),
            "std": power.std(),
            "lat": per_panel["latitude_rounded"].mean(),
            "lon": per_panel["longitude_rounded"].mean(),
            "start_date": per_panel["date"].min(),
            "end_date": per_panel["date"].max(),
            "mean_winter": seasonal_mean([12, 1, 2]),
            "mean_spring": seasonal_mean([3, 4, 5]),
            "mean_summer": seasonal_mean([6, 7, 8]),
            "mean_autumn": seasonal_mean([9, 10, 11]),
        }
        df_statistics = pd.DataFrame(data=data).rename_axis("ss_id").reset_index()
        self.df_station = df_statistics.merge(self.df_station, on="ss_id")
        return self.df_station

    def get_data_availability(self, df: pd.DataFrame) -> pd.DataFrame:
        """Get information about the data availability.

        For every id in ``self.ids`` the hourly samples are counted against two
        windows: the fixed reference window ``AVAILABILITY_START`` ..
        ``AVAILABILITY_END`` ("total") and the panel's own first-to-last date
        ("span"). Ids without any row in ``df`` are added to ``self.outliers``.
        The result is merged into ``self.df_station``.

        Args:
            df (pd.DataFrame): complete dataframe with all panels

        Returns:
            pd.DataFrame: dataframe with info for data availability for each panel in ids
        """
        # The reference window is the same for every panel: build it once.
        reference_hours = pd.date_range(
            start=self.AVAILABILITY_START, end=self.AVAILABILITY_END, freq="h"
        )
        total_length = len(reference_hours)
        # Split the dates by panel once instead of scanning the frame per id.
        dates_by_panel = {panel: dates for panel, dates in df.groupby("ss_id")["date"]}

        data = {
            "ss_id": [],
            "total_length": [],
            "data_length": [],
            "time_span": [],
            "missing_values_total": [],
            "missing_values_span": [],
        }
        for id_ in self.ids:
            dates = dates_by_panel.get(id_)
            if dates is None or dates.empty:
                # e.g. every sample of the panel was removed by an earlier step
                self.outliers.append(id_)
                continue
            time_span = len(pd.date_range(start=dates.min(), end=dates.max(), freq="h"))
            # number of reference hours for which the panel has a sample
            available = int(reference_hours.isin(dates).sum())
            data["ss_id"].append(id_)
            data["total_length"].append(total_length)
            data["data_length"].append(available)
            data["time_span"].append(time_span)
            data["missing_values_total"].append(total_length - available)
            data["missing_values_span"].append(time_span - available)
        df_data = pd.DataFrame(data=data)
        self.df_station = df_data.merge(self.df_station, on="ss_id")
        return self.df_station

    def remove_zero_sequences(self, df: pd.DataFrame, criterion: dict) -> pd.DataFrame:
        """Remove long sequences of zeros in the target data.

        A sequence is a run of consecutive rows (in the order of ``df``) of one
        panel whose target is exactly 0. Runs are detected per panel, so zeros at
        the end of one panel and at the start of the next never form one run.
        Runs shorter than ``criterion["zero_sequence_length"]`` are kept.

        Args:
            df (pd.DataFrame): complete dataframe with all panels
            criterion (dict): must contain ``"zero_sequence_length"``, the minimum
                run length from which on a run of zeros is removed

        Returns:
            pd.DataFrame: complete dataframe with all panels
                with samples removed that are a sequence of zeros
        """
        n = criterion["zero_sequence_length"]
        # Work on a positional copy so a non-unique index on ``df`` cannot
        # duplicate or drop rows.
        runs = pd.DataFrame(
            {
                "ss_id": df["ss_id"].to_numpy(),
                "is_zero": (df[self.target] == 0).to_numpy().astype("int8"),
            }
        )
        # A new run starts wherever the zero flag differs from the previous row of
        # the same panel (the first row of a panel compares against NaN).
        previous_flag = runs.groupby("ss_id")["is_zero"].shift()
        run_start = runs["is_zero"].ne(previous_flag).astype("int64")
        runs["run_id"] = run_start.groupby(runs["ss_id"]).cumsum()
        run_length = runs.groupby(["ss_id", "run_id"])["is_zero"].transform("size")

        keep = (runs["is_zero"] == 0) | (run_length < n)
        return df[keep.to_numpy()]

    def remove_outliers(self, df_statistics: pd.DataFrame, criterion: dict) -> tuple:
        """Remove outliers depending on ``criterion`` from the statistics.

        For the training set (``self.train``) three additional criteria flag panels:
        a time span below ``criterion["min_time_span"]``, a share of missing values
        within the panel's span above ``criterion["percentage_missing"]``, and a
        median above the ``criterion["percentile"]``-th percentile of all medians.
        For the test set only the already collected outliers are removed.

        Args:
            df_statistics (pd.DataFrame): dataframe with statistics for each panel in ids
            criterion (dict): criterion to remove outliers

        Returns:
            pd.DataFrame: dataframe with statistics for each panel in ids with outliers removed
            list: list of outliers ids
        """
        if self.train:
            # remove time series that have few data
            too_short = df_statistics["time_span"] < criterion["min_time_span"]
            self.outliers.extend(df_statistics.loc[too_short, "ss_id"].tolist())
            # remove data with more than p% missing values of available time span
            missing_share = df_statistics["missing_values_span"] / df_statistics["time_span"]
            too_sparse = missing_share > criterion["percentage_missing"]
            self.outliers.extend(df_statistics.loc[too_sparse, "ss_id"].tolist())
            # remove panels with median above p-percentile
            threshold = np.percentile(df_statistics["median"], q=criterion["percentile"])
            too_large = df_statistics["median"] > threshold
            self.outliers.extend(df_statistics.loc[too_large, "ss_id"].tolist())
            # a panel can be flagged by several criteria
            self.outliers = list(set(self.outliers))

        df_statistics = df_statistics[~df_statistics["ss_id"].isin(self.outliers)]
        return df_statistics, self.outliers

    def __call__(
        self,
        df: pd.DataFrame,
        night_criterion: dict,
        statistics_criterion: dict,
        filter_criterion: dict = None,
    ) -> tuple:
        """Remove outliers from dataframe and return statistics of the remaining panels.

        Steps: filter rows, drop long zero runs, night-energy statistics, per-panel
        statistics, data availability, outlier selection.

        NOTE(review): the night statistics are computed after the row filter. With
        the default day-only filter no night rows are left at that point, so
        ``night_criterion`` cannot flag anything - confirm this order is intended.

        Args:
            df (pd.DataFrame): complete dataframe with all panels
            night_criterion (dict): criterion passed to ``get_night_energy``
            statistics_criterion (dict): criterion passed to ``remove_zero_sequences``
                and ``remove_outliers``
            filter_criterion (dict, optional): filter passed to ``apply_filter``. When
                omitted, a module-level ``filter`` dict is used if one exists (this is
                how the class was configured originally), otherwise ``DEFAULT_FILTER``.

        Returns:
            pd.DataFrame: dataframe with statistics for each panel for all ids
            pd.DataFrame: dataframe with statistics for each panel in ids with outliers removed
            list: list of outliers ids
        """
        if filter_criterion is None:
            legacy_filter = globals().get("filter")
            if isinstance(legacy_filter, dict):
                filter_criterion = legacy_filter
            else:
                filter_criterion = dict(self.DEFAULT_FILTER)

        df = self.apply_filter(df, filter_criterion)
        df = self.remove_zero_sequences(df, statistics_criterion)
        self.get_night_energy(df, night_criterion)
        self.get_statistics(df)
        df_statistics = self.get_data_availability(df)
        df_statistics_clean, outliers = self.remove_outliers(df_statistics, statistics_criterion)
        return df_statistics, df_statistics_clean, outliers

    def _night_sum(self, df_loc):
        """Return the sum of the target over the night rows (``is_day == 0``) of ``df_loc``."""
        return df_loc[df_loc["is_day"] == 0][self.target].sum()

    def _night_mean(self, df_loc):
        """Return the mean of the target over the night rows (``is_day == 0``) of ``df_loc``."""
        return df_loc[df_loc["is_day"] == 0][self.target].mean()

    def _night_median(self, df_loc):
        """Return the median of the target over the night rows (``is_day == 0``) of ``df_loc``."""
        return df_loc[df_loc["is_day"] == 0][self.target].median()
| 302 | + |
| 303 | + |
# %%
# Column holding the power measurements.
target = "average_power_kw"
# Row filter: only keep day-time samples.
filter = {"is_day": 1}
# Panels reaching this total night-time power are flagged.
night_criterion = dict(total_power_at_night_kw=100)
# Thresholds for the zero-run removal and the training-set outlier criteria.
statistics_criterion = dict(
    min_time_span=720,
    percentage_missing=0.9,
    percentile=90,
    zero_sequence_length=30,
)

# Training set: all outlier criteria are applied.
clean_outliers = CleanOutliers(train=True, outliers=outliers_train)
_train_result = clean_outliers(df_train, night_criterion, statistics_criterion)
df_statistics_train, df_statistics_train_clean, outliers_train = _train_result
print(len(outliers_train))

# Test set: only the outliers listed beforehand are removed.
clean_outliers = CleanOutliers(train=False, outliers=outliers_test)
_test_result = clean_outliers(df_test, night_criterion, statistics_criterion)
df_statistics_test, df_statistics_test_clean, outliers_test = _test_result
print(len(outliers_test))
# %%
# Boxplots of the per-panel statistics: raw train, cleaned train and test side by side.
_boxplot_columns = ["mean", "median", "std", "mean_summer"]
_boxplot_panels = (
    ("train", df_statistics_train),
    ("train clean", df_statistics_train_clean),
    ("test", df_statistics_test),
)
fig, ax = plt.subplots(1, 3, figsize=(10, 5))
for _axis, (_label, _frame) in zip(ax, _boxplot_panels):
    _frame.boxplot(column=_boxplot_columns, ax=_axis)
    _axis.set_title(_label)
    # same y-range everywhere so the three panels are directly comparable
    _axis.set_ylim(0, 1)
if save:
    plt.savefig("boxplot_only_day.png")
# %%
# Maps of one per-panel statistic: cleaned training panels (left) and test panels (right).
var = "median"
df1 = df_statistics_train_clean
df2 = df_statistics_test
x1, y1, data1 = df1.lon, df1.lat, df1[var]
x2, y2, data2 = df2.lon, df2.lat, df2[var]
title1 = f"train - {var} ({len(df1)})"
title2 = f"test - {var} ({len(df2)})"
# shared colour scale for both maps
vmin = 0
vmax = np.max([data1.max(), data2.max()])
# map extent: lower-left and upper-right corner
lat1, lon1, lat2, lon2 = 50.0, -5.7, 59.0, 1.8

fig = plt.figure(figsize=(15, 6))
cm = plt.colormaps["RdYlBu"]
_map_panels = ((x1, y1, data1, title1), (x2, y2, data2, title2))
for _position, (_x, _y, _values, _title) in enumerate(_map_panels, start=1):
    ax = fig.add_subplot(1, 2, _position, projection=crs.PlateCarree())
    ax.coastlines()
    gl = ax.gridlines(draw_labels=True)
    gl.top_labels = False
    gl.left_labels = False
    ax.set_extent([lon1, lon2, lat1, lat2], crs=crs.PlateCarree())
    # pyplot draws on the subplot that was added last
    sc = plt.scatter(
        x=_x,
        y=_y,
        c=_values,
        vmin=vmin,
        vmax=vmax,
        cmap=cm,
        s=2,
        alpha=0.8,
        transform=crs.PlateCarree(),
    )
    plt.colorbar(sc)
    plt.title(_title)
if save:
    plt.savefig(f"{var}_only_day.png")
| 385 | + |
| 386 | +# %% |
0 commit comments