From 31e19add2745ec90c66adbd8a5adedbcb1ac7cf3 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 26 May 2023 18:40:14 +0200 Subject: [PATCH 01/32] start transition toward outsourcing the data pipeline with objective of improving pipeline flexibility --- .../freqai/base_models/BasePyTorchModel.py | 75 +- .../freqai/base_models/BaseRegressionModel.py | 37 +- .../freqai/base_models/BaseTensorFlowModel.py | 70 -- freqtrade/freqai/data_drawer.py | 15 + freqtrade/freqai/data_kitchen.py | 843 +++++++++--------- freqtrade/freqai/freqai_interface.py | 28 + requirements-freqai.txt | 1 + tests/freqai/test_freqai_datakitchen.py | 96 +- 8 files changed, 579 insertions(+), 586 deletions(-) delete mode 100644 freqtrade/freqai/base_models/BaseTensorFlowModel.py diff --git a/freqtrade/freqai/base_models/BasePyTorchModel.py b/freqtrade/freqai/base_models/BasePyTorchModel.py index 82042d24c..21dc4e894 100644 --- a/freqtrade/freqai/base_models/BasePyTorchModel.py +++ b/freqtrade/freqai/base_models/BasePyTorchModel.py @@ -7,14 +7,15 @@ import torch from pandas import DataFrame from freqtrade.freqai.data_kitchen import FreqaiDataKitchen -from freqtrade.freqai.freqai_interface import IFreqaiModel +# from freqtrade.freqai.freqai_interface import IFreqaiModel +from freqtrade.freqai.base_models import BaseRegressionModel from freqtrade.freqai.torch.PyTorchDataConvertor import PyTorchDataConvertor logger = logging.getLogger(__name__) -class BasePyTorchModel(IFreqaiModel, ABC): +class BasePyTorchModel(BaseRegressionModel): """ Base class for PyTorch type models. User *must* inherit from this class and set fit() and predict() and @@ -29,50 +30,50 @@ class BasePyTorchModel(IFreqaiModel, ABC): self.splits = ["train", "test"] if test_size != 0 else ["train"] self.window_size = self.freqai_info.get("conv_width", 1) - def train( - self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs - ) -> Any: - """ - Filter the training data and train a model to it. 
Train makes heavy use of the datakitchen - for storing, saving, loading, and analyzing the data. - :param unfiltered_df: Full dataframe for the current training period - :return: - :model: Trained model which can be used to inference (self.predict) - """ + # def train( + # self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs + # ) -> Any: + # """ + # Filter the training data and train a model to it. Train makes heavy use of the datakitchen + # for storing, saving, loading, and analyzing the data. + # :param unfiltered_df: Full dataframe for the current training period + # :return: + # :model: Trained model which can be used to inference (self.predict) + # """ - logger.info(f"-------------------- Starting training {pair} --------------------") + # logger.info(f"-------------------- Starting training {pair} --------------------") - start_time = time() + # start_time = time() - features_filtered, labels_filtered = dk.filter_features( - unfiltered_df, - dk.training_features_list, - dk.label_list, - training_filter=True, - ) + # features_filtered, labels_filtered = dk.filter_features( + # unfiltered_df, + # dk.training_features_list, + # dk.label_list, + # training_filter=True, + # ) - # split data into train/test data. - data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) - if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: - dk.fit_labels() - # normalize all data based on train_dataset only - data_dictionary = dk.normalize_data(data_dictionary) + # # split data into train/test data. 
+ # data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) + # if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: + # dk.fit_labels() + # # normalize all data based on train_dataset only + # data_dictionary = dk.normalize_data(data_dictionary) - # optional additional data cleaning/analysis - self.data_cleaning_train(dk) + # # optional additional data cleaning/analysis + # self.data_cleaning_train(dk) - logger.info( - f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" - ) - logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") + # logger.info( + # f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" + # ) + # logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") - model = self.fit(data_dictionary, dk) - end_time = time() + # model = self.fit(data_dictionary, dk) + # end_time = time() - logger.info(f"-------------------- Done training {pair} " - f"({end_time - start_time:.2f} secs) --------------------") + # logger.info(f"-------------------- Done training {pair} " + # f"({end_time - start_time:.2f} secs) --------------------") - return model + # return model @property @abstractmethod diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index 1f9b4f5a6..45660253e 100644 --- a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -49,21 +49,34 @@ class BaseRegressionModel(IFreqaiModel): logger.info(f"-------------------- Training on data from {start_date} to " f"{end_date} --------------------") # split data into train/test data. 
- data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) + d = dk.make_train_test_datasets(features_filtered, labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - # normalize all data based on train_dataset only - data_dictionary = dk.normalize_data(data_dictionary) - # optional additional data cleaning/analysis - self.data_cleaning_train(dk) + self.define_data_pipeline(dk) + self.define_label_pipeline(dk) + + d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) + d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + + (d["train_features"], + d["train_labels"], + d["train_weights"]) = dk.pipeline.fit_transform(d["train_features"], + d["train_labels"], + d["train_weights"]) + + (d["test_features"], + d["test_labels"], + d["test_weights"]) = dk.pipeline.transform(d["test_features"], + d["test_labels"], + d["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" ) - logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") + logger.info(f"Training model on {len(d['train_features'])} data points") - model = self.fit(data_dictionary, dk) + model = self.fit(d, dk) end_time = time() @@ -88,11 +101,11 @@ class BaseRegressionModel(IFreqaiModel): filtered_df, _ = dk.filter_features( unfiltered_df, dk.training_features_list, training_filter=False ) - filtered_df = dk.normalize_data_from_metadata(filtered_df) + # filtered_df = dk.normalize_data_from_metadata(filtered_df) dk.data_dictionary["prediction_features"] = filtered_df - # optional additional data cleaning/analysis - self.data_cleaning_predict(dk) + dk.data_dictionary["prediction_features"], outliers, _ = dk.pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) predictions = self.model.predict(dk.data_dictionary["prediction_features"]) if self.CONV_WIDTH == 1: @@ -100,6 
+113,8 @@ class BaseRegressionModel(IFreqaiModel): pred_df = DataFrame(predictions, columns=dk.label_list) - pred_df = dk.denormalize_labels_from_metadata(pred_df) + pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) + dk.DI_values = dk.label_pipeline.get_step("di").di_values + dk.do_predict = outliers.to_numpy() return (pred_df, dk.do_predict) diff --git a/freqtrade/freqai/base_models/BaseTensorFlowModel.py b/freqtrade/freqai/base_models/BaseTensorFlowModel.py deleted file mode 100644 index b41ee0175..000000000 --- a/freqtrade/freqai/base_models/BaseTensorFlowModel.py +++ /dev/null @@ -1,70 +0,0 @@ -import logging -from time import time -from typing import Any - -from pandas import DataFrame - -from freqtrade.freqai.data_kitchen import FreqaiDataKitchen -from freqtrade.freqai.freqai_interface import IFreqaiModel - - -logger = logging.getLogger(__name__) - - -class BaseTensorFlowModel(IFreqaiModel): - """ - Base class for TensorFlow type models. - User *must* inherit from this class and set fit() and predict(). - """ - - def train( - self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs - ) -> Any: - """ - Filter the training data and train a model to it. Train makes heavy use of the datakitchen - for storing, saving, loading, and analyzing the data. - :param unfiltered_df: Full dataframe for the current training period - :param metadata: pair metadata from strategy. 
- :return: - :model: Trained model which can be used to inference (self.predict) - """ - - logger.info(f"-------------------- Starting training {pair} --------------------") - - start_time = time() - - # filter the features requested by user in the configuration file and elegantly handle NaNs - features_filtered, labels_filtered = dk.filter_features( - unfiltered_df, - dk.training_features_list, - dk.label_list, - training_filter=True, - ) - - start_date = unfiltered_df["date"].iloc[0].strftime("%Y-%m-%d") - end_date = unfiltered_df["date"].iloc[-1].strftime("%Y-%m-%d") - logger.info(f"-------------------- Training on data from {start_date} to " - f"{end_date} --------------------") - # split data into train/test data. - data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) - if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: - dk.fit_labels() - # normalize all data based on train_dataset only - data_dictionary = dk.normalize_data(data_dictionary) - - # optional additional data cleaning/analysis - self.data_cleaning_train(dk) - - logger.info( - f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" - ) - logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") - - model = self.fit(data_dictionary, dk) - - end_time = time() - - logger.info(f"-------------------- Done training {pair} " - f"({end_time - start_time:.2f} secs) --------------------") - - return model diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index b68a9dcad..9fdcc2d41 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -460,6 +460,13 @@ class FreqaiDataDrawer: with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp: rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) + # save the pipelines to pickle files + with (save_path / f"{dk.model_filename}_pipeline.pkl").open("wb") 
as fp: + cloudpickle.dump(dk.pipeline, fp) + + with (save_path / f"{dk.model_filename}_label_pipeline.pkl").open("wb") as fp: + cloudpickle.dump(dk.label_pipeline, fp) + # save the train data to file so we can check preds for area of applicability later dk.data_dictionary["train_features"].to_pickle( save_path / f"{dk.model_filename}_trained_df.pkl" @@ -482,6 +489,8 @@ class FreqaiDataDrawer: self.meta_data_dictionary[coin] = {} self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"] self.meta_data_dictionary[coin]["meta_data"] = dk.data + self.meta_data_dictionary[coin]["pipeline"] = dk.pipeline + self.meta_data_dictionary[coin]["label_pipeline"] = dk.label_pipeline self.save_drawer_to_disk() return @@ -513,6 +522,8 @@ class FreqaiDataDrawer: if coin in self.meta_data_dictionary: dk.data = self.meta_data_dictionary[coin]["meta_data"] dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"] + dk.pipeline = self.meta_data_dictionary[coin]["pipeline"] + dk.label_pipeline = self.meta_data_dictionary[coin]["label_pipeline"] else: with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp: dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) @@ -520,6 +531,10 @@ class FreqaiDataDrawer: dk.data_dictionary["train_features"] = pd.read_pickle( dk.data_path / f"{dk.model_filename}_trained_df.pkl" ) + with (dk.data_path / f"{dk.model_filename}_pipeline.pkl").open("rb") as fp: + dk.pipeline = cloudpickle.load(fp) + with (dk.data_path / f"{dk.model_filename}_label_pipeline.pkl").open("rb") as fp: + dk.label_pipeline = cloudpickle.load(fp) dk.training_features_list = dk.data["training_features_list"] dk.label_list = dk.data["label_list"] diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 21b41db2d..adfeb8dd5 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -27,6 +27,7 @@ from freqtrade.exceptions import OperationalException from 
freqtrade.exchange import timeframe_to_seconds from freqtrade.strategy import merge_informative_pair from freqtrade.strategy.interface import IStrategy +from datasieve.pipeline import Pipeline SECONDS_IN_DAY = 86400 @@ -86,6 +87,8 @@ class FreqaiDataKitchen: self.keras: bool = self.freqai_config.get("keras", False) self.set_all_pairs() self.backtest_live_models = config.get("freqai_backtest_live_models", False) + self.pipeline = Pipeline() + self.label_pipeline = Pipeline() if not self.live: self.full_path = self.get_full_models_path(self.config) @@ -307,106 +310,106 @@ class FreqaiDataKitchen: return self.data_dictionary - def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: - """ - Normalize all data in the data_dictionary according to the training dataset - :param data_dictionary: dictionary containing the cleaned and - split training/test data/labels - :returns: - :data_dictionary: updated dictionary with standardized values. - """ + # def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: + # """ + # Normalize all data in the data_dictionary according to the training dataset + # :param data_dictionary: dictionary containing the cleaned and + # split training/test data/labels + # :returns: + # :data_dictionary: updated dictionary with standardized values. 
+ # """ - # standardize the data by training stats - train_max = data_dictionary["train_features"].max() - train_min = data_dictionary["train_features"].min() - data_dictionary["train_features"] = ( - 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1 - ) - data_dictionary["test_features"] = ( - 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 - ) + # # standardize the data by training stats + # train_max = data_dictionary["train_features"].max() + # train_min = data_dictionary["train_features"].min() + # data_dictionary["train_features"] = ( + # 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1 + # ) + # data_dictionary["test_features"] = ( + # 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 + # ) - for item in train_max.keys(): - self.data[item + "_max"] = train_max[item] - self.data[item + "_min"] = train_min[item] + # for item in train_max.keys(): + # self.data[item + "_max"] = train_max[item] + # self.data[item + "_min"] = train_min[item] - for item in data_dictionary["train_labels"].keys(): - if data_dictionary["train_labels"][item].dtype == object: - continue - train_labels_max = data_dictionary["train_labels"][item].max() - train_labels_min = data_dictionary["train_labels"][item].min() - data_dictionary["train_labels"][item] = ( - 2 - * (data_dictionary["train_labels"][item] - train_labels_min) - / (train_labels_max - train_labels_min) - - 1 - ) - if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: - data_dictionary["test_labels"][item] = ( - 2 - * (data_dictionary["test_labels"][item] - train_labels_min) - / (train_labels_max - train_labels_min) - - 1 - ) + # for item in data_dictionary["train_labels"].keys(): + # if data_dictionary["train_labels"][item].dtype == object: + # continue + # train_labels_max = data_dictionary["train_labels"][item].max() + # train_labels_min = 
data_dictionary["train_labels"][item].min() + # data_dictionary["train_labels"][item] = ( + # 2 + # * (data_dictionary["train_labels"][item] - train_labels_min) + # / (train_labels_max - train_labels_min) + # - 1 + # ) + # if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + # data_dictionary["test_labels"][item] = ( + # 2 + # * (data_dictionary["test_labels"][item] - train_labels_min) + # / (train_labels_max - train_labels_min) + # - 1 + # ) - self.data[f"{item}_max"] = train_labels_max - self.data[f"{item}_min"] = train_labels_min - return data_dictionary + # self.data[f"{item}_max"] = train_labels_max + # self.data[f"{item}_min"] = train_labels_min + # return data_dictionary - def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: + # def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: - train_max = df.max() - train_min = df.min() - df = ( - 2 * (df - train_min) / (train_max - train_min) - 1 - ) + # train_max = df.max() + # train_min = df.min() + # df = ( + # 2 * (df - train_min) / (train_max - train_min) - 1 + # ) - for item in train_max.keys(): - self.data[item + "_max"] = train_max[item] - self.data[item + "_min"] = train_min[item] + # for item in train_max.keys(): + # self.data[item + "_max"] = train_max[item] + # self.data[item + "_min"] = train_min[item] - return df + # return df - def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: - """ - Normalize a set of data using the mean and standard deviation from - the associated training data. - :param df: Dataframe to be standardized - """ + # def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: + # """ + # Normalize a set of data using the mean and standard deviation from + # the associated training data. 
+ # :param df: Dataframe to be standardized + # """ - train_max = [None] * len(df.keys()) - train_min = [None] * len(df.keys()) + # train_max = [None] * len(df.keys()) + # train_min = [None] * len(df.keys()) - for i, item in enumerate(df.keys()): - train_max[i] = self.data[f"{item}_max"] - train_min[i] = self.data[f"{item}_min"] + # for i, item in enumerate(df.keys()): + # train_max[i] = self.data[f"{item}_max"] + # train_min[i] = self.data[f"{item}_min"] - train_max_series = pd.Series(train_max, index=df.keys()) - train_min_series = pd.Series(train_min, index=df.keys()) + # train_max_series = pd.Series(train_max, index=df.keys()) + # train_min_series = pd.Series(train_min, index=df.keys()) - df = ( - 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1 - ) + # df = ( + # 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1 + # ) - return df + # return df - def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: - """ - Denormalize a set of data using the mean and standard deviation from - the associated training data. - :param df: Dataframe of predictions to be denormalized - """ + # def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: + # """ + # Denormalize a set of data using the mean and standard deviation from + # the associated training data. 
+ # :param df: Dataframe of predictions to be denormalized + # """ - for label in df.columns: - if df[label].dtype == object or label in self.unique_class_list: - continue - df[label] = ( - (df[label] + 1) - * (self.data[f"{label}_max"] - self.data[f"{label}_min"]) - / 2 - ) + self.data[f"{label}_min"] + # for label in df.columns: + # if df[label].dtype == object or label in self.unique_class_list: + # continue + # df[label] = ( + # (df[label] + 1) + # * (self.data[f"{label}_max"] - self.data[f"{label}_min"]) + # / 2 + # ) + self.data[f"{label}_min"] - return df + # return df def split_timerange( self, tr: str, train_split: int = 28, bt_split: float = 7 @@ -501,398 +504,398 @@ class FreqaiDataKitchen: return df_predictions - def principal_component_analysis(self) -> None: - """ - Performs Principal Component Analysis on the data for dimensionality reduction - and outlier detection (see self.remove_outliers()) - No parameters or returns, it acts on the data_dictionary held by the DataHandler. - """ + # def principal_component_analysis(self) -> None: + # """ + # Performs Principal Component Analysis on the data for dimensionality reduction + # and outlier detection (see self.remove_outliers()) + # No parameters or returns, it acts on the data_dictionary held by the DataHandler. 
+ # """ - from sklearn.decomposition import PCA # avoid importing if we dont need it + # from sklearn.decomposition import PCA # avoid importing if we dont need it - pca = PCA(0.999) - pca = pca.fit(self.data_dictionary["train_features"]) - n_keep_components = pca.n_components_ - self.data["n_kept_components"] = n_keep_components - n_components = self.data_dictionary["train_features"].shape[1] - logger.info("reduced feature dimension by %s", n_components - n_keep_components) - logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_)) + # pca = PCA(0.999) + # pca = pca.fit(self.data_dictionary["train_features"]) + # n_keep_components = pca.n_components_ + # self.data["n_kept_components"] = n_keep_components + # n_components = self.data_dictionary["train_features"].shape[1] + # logger.info("reduced feature dimension by %s", n_components - n_keep_components) + # logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_)) - train_components = pca.transform(self.data_dictionary["train_features"]) - self.data_dictionary["train_features"] = pd.DataFrame( - data=train_components, - columns=["PC" + str(i) for i in range(0, n_keep_components)], - index=self.data_dictionary["train_features"].index, - ) - # normalsing transformed training features - self.data_dictionary["train_features"] = self.normalize_single_dataframe( - self.data_dictionary["train_features"]) + # train_components = pca.transform(self.data_dictionary["train_features"]) + # self.data_dictionary["train_features"] = pd.DataFrame( + # data=train_components, + # columns=["PC" + str(i) for i in range(0, n_keep_components)], + # index=self.data_dictionary["train_features"].index, + # ) + # # normalsing transformed training features + # self.data_dictionary["train_features"] = self.normalize_single_dataframe( + # self.data_dictionary["train_features"]) - # keeping a copy of the non-transformed features so we can check for errors during - # model load from disk - 
self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list) - self.training_features_list = self.data_dictionary["train_features"].columns + # # keeping a copy of the non-transformed features so we can check for errors during + # # model load from disk + # self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list) + # self.training_features_list = self.data_dictionary["train_features"].columns - if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: - test_components = pca.transform(self.data_dictionary["test_features"]) - self.data_dictionary["test_features"] = pd.DataFrame( - data=test_components, - columns=["PC" + str(i) for i in range(0, n_keep_components)], - index=self.data_dictionary["test_features"].index, - ) - # normalise transformed test feature to transformed training features - self.data_dictionary["test_features"] = self.normalize_data_from_metadata( - self.data_dictionary["test_features"]) + # if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + # test_components = pca.transform(self.data_dictionary["test_features"]) + # self.data_dictionary["test_features"] = pd.DataFrame( + # data=test_components, + # columns=["PC" + str(i) for i in range(0, n_keep_components)], + # index=self.data_dictionary["test_features"].index, + # ) + # # normalise transformed test feature to transformed training features + # self.data_dictionary["test_features"] = self.normalize_data_from_metadata( + # self.data_dictionary["test_features"]) - self.data["n_kept_components"] = n_keep_components - self.pca = pca + # self.data["n_kept_components"] = n_keep_components + # self.pca = pca - logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}") + # logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}") - if not self.data_path.is_dir(): - self.data_path.mkdir(parents=True, exist_ok=True) + # if not 
self.data_path.is_dir(): + # self.data_path.mkdir(parents=True, exist_ok=True) - return None + # return None - def pca_transform(self, filtered_dataframe: DataFrame) -> None: - """ - Use an existing pca transform to transform data into components - :param filtered_dataframe: DataFrame = the cleaned dataframe - """ - pca_components = self.pca.transform(filtered_dataframe) - self.data_dictionary["prediction_features"] = pd.DataFrame( - data=pca_components, - columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])], - index=filtered_dataframe.index, - ) - # normalise transformed predictions to transformed training features - self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata( - self.data_dictionary["prediction_features"]) + # def pca_transform(self, filtered_dataframe: DataFrame) -> None: + # """ + # Use an existing pca transform to transform data into components + # :param filtered_dataframe: DataFrame = the cleaned dataframe + # """ + # pca_components = self.pca.transform(filtered_dataframe) + # self.data_dictionary["prediction_features"] = pd.DataFrame( + # data=pca_components, + # columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])], + # index=filtered_dataframe.index, + # ) + # # normalise transformed predictions to transformed training features + # self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata( + # self.data_dictionary["prediction_features"]) - def compute_distances(self) -> float: - """ - Compute distances between each training point and every other training - point. 
This metric defines the neighborhood of trained data and is used - for prediction confidence in the Dissimilarity Index - """ - # logger.info("computing average mean distance for all training points") - pairwise = pairwise_distances( - self.data_dictionary["train_features"], n_jobs=self.thread_count) - # remove the diagonal distances which are itself distances ~0 - np.fill_diagonal(pairwise, np.NaN) - pairwise = pairwise.reshape(-1, 1) - avg_mean_dist = pairwise[~np.isnan(pairwise)].mean() + # def compute_distances(self) -> float: + # """ + # Compute distances between each training point and every other training + # point. This metric defines the neighborhood of trained data and is used + # for prediction confidence in the Dissimilarity Index + # """ + # # logger.info("computing average mean distance for all training points") + # pairwise = pairwise_distances( + # self.data_dictionary["train_features"], n_jobs=self.thread_count) + # # remove the diagonal distances which are itself distances ~0 + # np.fill_diagonal(pairwise, np.NaN) + # pairwise = pairwise.reshape(-1, 1) + # avg_mean_dist = pairwise[~np.isnan(pairwise)].mean() - return avg_mean_dist + # return avg_mean_dist - def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float: - """ - Check if more than X% of points werer dropped during outlier detection. - """ - outlier_protection_pct = self.freqai_config["feature_parameters"].get( - "outlier_protection_percentage", 30) - outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100 - if outlier_pct >= outlier_protection_pct: - return outlier_pct - else: - return 0.0 + # def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float: + # """ + # Check if more than X% of points werer dropped during outlier detection. 
+ # """ + # outlier_protection_pct = self.freqai_config["feature_parameters"].get( + # "outlier_protection_percentage", 30) + # outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100 + # if outlier_pct >= outlier_protection_pct: + # return outlier_pct + # else: + # return 0.0 - def use_SVM_to_remove_outliers(self, predict: bool) -> None: - """ - Build/inference a Support Vector Machine to detect outliers - in training data and prediction - :param predict: bool = If true, inference an existing SVM model, else construct one - """ + # def use_SVM_to_remove_outliers(self, predict: bool) -> None: + # """ + # Build/inference a Support Vector Machine to detect outliers + # in training data and prediction + # :param predict: bool = If true, inference an existing SVM model, else construct one + # """ - if self.keras: - logger.warning( - "SVM outlier removal not currently supported for Keras based models. " - "Skipping user requested function." - ) - if predict: - self.do_predict = np.ones(len(self.data_dictionary["prediction_features"])) - return + # if self.keras: + # logger.warning( + # "SVM outlier removal not currently supported for Keras based models. " + # "Skipping user requested function." 
+ # ) + # if predict: + # self.do_predict = np.ones(len(self.data_dictionary["prediction_features"])) + # return - if predict: - if not self.svm_model: - logger.warning("No svm model available for outlier removal") - return - y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"]) - do_predict = np.where(y_pred == -1, 0, y_pred) + # if predict: + # if not self.svm_model: + # logger.warning("No svm model available for outlier removal") + # return + # y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"]) + # do_predict = np.where(y_pred == -1, 0, y_pred) - if (len(do_predict) - do_predict.sum()) > 0: - logger.info(f"SVM tossed {len(do_predict) - do_predict.sum()} predictions.") - self.do_predict += do_predict - self.do_predict -= 1 + # if (len(do_predict) - do_predict.sum()) > 0: + # logger.info(f"SVM tossed {len(do_predict) - do_predict.sum()} predictions.") + # self.do_predict += do_predict + # self.do_predict -= 1 - else: - # use SGDOneClassSVM to increase speed? - svm_params = self.freqai_config["feature_parameters"].get( - "svm_params", {"shuffle": False, "nu": 0.1}) - self.svm_model = linear_model.SGDOneClassSVM(**svm_params).fit( - self.data_dictionary["train_features"] - ) - y_pred = self.svm_model.predict(self.data_dictionary["train_features"]) - kept_points = np.where(y_pred == -1, 0, y_pred) - # keep_index = np.where(y_pred == 1) - outlier_pct = self.get_outlier_percentage(1 - kept_points) - if outlier_pct: - logger.warning( - f"SVM detected {outlier_pct:.2f}% of the points as outliers. " - f"Keeping original dataset." - ) - self.svm_model = None - return + # else: + # # use SGDOneClassSVM to increase speed? 
+ # svm_params = self.freqai_config["feature_parameters"].get( + # "svm_params", {"shuffle": False, "nu": 0.1}) + # self.svm_model = linear_model.SGDOneClassSVM(**svm_params).fit( + # self.data_dictionary["train_features"] + # ) + # y_pred = self.svm_model.predict(self.data_dictionary["train_features"]) + # kept_points = np.where(y_pred == -1, 0, y_pred) + # # keep_index = np.where(y_pred == 1) + # outlier_pct = self.get_outlier_percentage(1 - kept_points) + # if outlier_pct: + # logger.warning( + # f"SVM detected {outlier_pct:.2f}% of the points as outliers. " + # f"Keeping original dataset." + # ) + # self.svm_model = None + # return - self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ - (y_pred == 1) - ] - self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ - (y_pred == 1) - ] - self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ - (y_pred == 1) - ] + # self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ + # (y_pred == 1) + # ] + # self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ + # (y_pred == 1) + # ] + # self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ + # (y_pred == 1) + # ] - logger.info( - f"SVM tossed {len(y_pred) - kept_points.sum()}" - f" train points from {len(y_pred)} total points." - ) + # logger.info( + # f"SVM tossed {len(y_pred) - kept_points.sum()}" + # f" train points from {len(y_pred)} total points." 
+ # ) - # same for test data - # TODO: This (and the part above) could be refactored into a separate function - # to reduce code duplication - if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0: - y_pred = self.svm_model.predict(self.data_dictionary["test_features"]) - kept_points = np.where(y_pred == -1, 0, y_pred) - self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ - (y_pred == 1) - ] - self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][( - y_pred == 1)] - self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ - (y_pred == 1) - ] + # # same for test data + # # TODO: This (and the part above) could be refactored into a separate function + # # to reduce code duplication + # if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0: + # y_pred = self.svm_model.predict(self.data_dictionary["test_features"]) + # kept_points = np.where(y_pred == -1, 0, y_pred) + # self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ + # (y_pred == 1) + # ] + # self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][( + # y_pred == 1)] + # self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ + # (y_pred == 1) + # ] - logger.info( - f"{self.pair}: SVM tossed {len(y_pred) - kept_points.sum()}" - f" test points from {len(y_pred)} total points." - ) + # logger.info( + # f"{self.pair}: SVM tossed {len(y_pred) - kept_points.sum()}" + # f" test points from {len(y_pred)} total points." + # ) - return + # return - def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None: - """ - Use DBSCAN to cluster training data and remove "noisy" data (read outliers). - User controls this via the config param `DBSCAN_outlier_pct` which indicates the - pct of training data that they want to be considered outliers. 
- :param predict: bool = If False (training), iterate to find the best hyper parameters - to match user requested outlier percent target. - If True (prediction), use the parameters determined from - the previous training to estimate if the current prediction point - is an outlier. - """ + # def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None: + # """ + # Use DBSCAN to cluster training data and remove "noisy" data (read outliers). + # User controls this via the config param `DBSCAN_outlier_pct` which indicates the + # pct of training data that they want to be considered outliers. + # :param predict: bool = If False (training), iterate to find the best hyper parameters + # to match user requested outlier percent target. + # If True (prediction), use the parameters determined from + # the previous training to estimate if the current prediction point + # is an outlier. + # """ - if predict: - if not self.data['DBSCAN_eps']: - return - train_ft_df = self.data_dictionary['train_features'] - pred_ft_df = self.data_dictionary['prediction_features'] - num_preds = len(pred_ft_df) - df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True) - clustering = DBSCAN(eps=self.data['DBSCAN_eps'], - min_samples=self.data['DBSCAN_min_samples'], - n_jobs=self.thread_count - ).fit(df) - do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1) + # if predict: + # if not self.data['DBSCAN_eps']: + # return + # train_ft_df = self.data_dictionary['train_features'] + # pred_ft_df = self.data_dictionary['prediction_features'] + # num_preds = len(pred_ft_df) + # df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True) + # clustering = DBSCAN(eps=self.data['DBSCAN_eps'], + # min_samples=self.data['DBSCAN_min_samples'], + # n_jobs=self.thread_count + # ).fit(df) + # do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1) - if (len(do_predict) - do_predict.sum()) > 0: - logger.info(f"DBSCAN tossed {len(do_predict) - 
do_predict.sum()} predictions") - self.do_predict += do_predict - self.do_predict -= 1 + # if (len(do_predict) - do_predict.sum()) > 0: + # logger.info(f"DBSCAN tossed {len(do_predict) - do_predict.sum()} predictions") + # self.do_predict += do_predict + # self.do_predict -= 1 - else: + # else: - def normalise_distances(distances): - normalised_distances = (distances - distances.min()) / \ - (distances.max() - distances.min()) - return normalised_distances + # def normalise_distances(distances): + # normalised_distances = (distances - distances.min()) / \ + # (distances.max() - distances.min()) + # return normalised_distances - def rotate_point(origin, point, angle): - # rotate a point counterclockwise by a given angle (in radians) - # around a given origin - x = origin[0] + cos(angle) * (point[0] - origin[0]) - \ - sin(angle) * (point[1] - origin[1]) - y = origin[1] + sin(angle) * (point[0] - origin[0]) + \ - cos(angle) * (point[1] - origin[1]) - return (x, y) + # def rotate_point(origin, point, angle): + # # rotate a point counterclockwise by a given angle (in radians) + # # around a given origin + # x = origin[0] + cos(angle) * (point[0] - origin[0]) - \ + # sin(angle) * (point[1] - origin[1]) + # y = origin[1] + sin(angle) * (point[0] - origin[0]) + \ + # cos(angle) * (point[1] - origin[1]) + # return (x, y) - MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25) - # measure pairwise distances to nearest neighbours - neighbors = NearestNeighbors( - n_neighbors=MinPts, n_jobs=self.thread_count) - neighbors_fit = neighbors.fit(self.data_dictionary['train_features']) - distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features']) - distances = np.sort(distances, axis=0).mean(axis=1) + # MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25) + # # measure pairwise distances to nearest neighbours + # neighbors = NearestNeighbors( + # n_neighbors=MinPts, n_jobs=self.thread_count) + # neighbors_fit = 
neighbors.fit(self.data_dictionary['train_features']) + # distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features']) + # distances = np.sort(distances, axis=0).mean(axis=1) - normalised_distances = normalise_distances(distances) - x_range = np.linspace(0, 1, len(distances)) - line = np.linspace(normalised_distances[0], - normalised_distances[-1], len(normalised_distances)) - deflection = np.abs(normalised_distances - line) - max_deflection_loc = np.where(deflection == deflection.max())[0][0] - origin = x_range[max_deflection_loc], line[max_deflection_loc] - point = x_range[max_deflection_loc], normalised_distances[max_deflection_loc] - rot_angle = np.pi / 4 - elbow_loc = rotate_point(origin, point, rot_angle) + # normalised_distances = normalise_distances(distances) + # x_range = np.linspace(0, 1, len(distances)) + # line = np.linspace(normalised_distances[0], + # normalised_distances[-1], len(normalised_distances)) + # deflection = np.abs(normalised_distances - line) + # max_deflection_loc = np.where(deflection == deflection.max())[0][0] + # origin = x_range[max_deflection_loc], line[max_deflection_loc] + # point = x_range[max_deflection_loc], normalised_distances[max_deflection_loc] + # rot_angle = np.pi / 4 + # elbow_loc = rotate_point(origin, point, rot_angle) - epsilon = elbow_loc[1] * (distances[-1] - distances[0]) + distances[0] + # epsilon = elbow_loc[1] * (distances[-1] - distances[0]) + distances[0] - clustering = DBSCAN(eps=epsilon, min_samples=MinPts, - n_jobs=int(self.thread_count)).fit( - self.data_dictionary['train_features'] - ) + # clustering = DBSCAN(eps=epsilon, min_samples=MinPts, + # n_jobs=int(self.thread_count)).fit( + # self.data_dictionary['train_features'] + # ) - logger.info(f'DBSCAN found eps of {epsilon:.2f}.') + # logger.info(f'DBSCAN found eps of {epsilon:.2f}.') - self.data['DBSCAN_eps'] = epsilon - self.data['DBSCAN_min_samples'] = MinPts - dropped_points = np.where(clustering.labels_ == -1, 1, 0) + # 
self.data['DBSCAN_eps'] = epsilon + # self.data['DBSCAN_min_samples'] = MinPts + # dropped_points = np.where(clustering.labels_ == -1, 1, 0) - outlier_pct = self.get_outlier_percentage(dropped_points) - if outlier_pct: - logger.warning( - f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. " - f"Keeping original dataset." - ) - self.data['DBSCAN_eps'] = 0 - return + # outlier_pct = self.get_outlier_percentage(dropped_points) + # if outlier_pct: + # logger.warning( + # f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. " + # f"Keeping original dataset." + # ) + # self.data['DBSCAN_eps'] = 0 + # return - self.data_dictionary['train_features'] = self.data_dictionary['train_features'][ - (clustering.labels_ != -1) - ] - self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ - (clustering.labels_ != -1) - ] - self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ - (clustering.labels_ != -1) - ] + # self.data_dictionary['train_features'] = self.data_dictionary['train_features'][ + # (clustering.labels_ != -1) + # ] + # self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ + # (clustering.labels_ != -1) + # ] + # self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ + # (clustering.labels_ != -1) + # ] - logger.info( - f"DBSCAN tossed {dropped_points.sum()}" - f" train points from {len(clustering.labels_)}" - ) + # logger.info( + # f"DBSCAN tossed {dropped_points.sum()}" + # f" train points from {len(clustering.labels_)}" + # ) - return + # return - def compute_inlier_metric(self, set_='train') -> None: - """ - Compute inlier metric from backwards distance distributions. - This metric defines how well features from a timepoint fit - into previous timepoints. - """ + # def compute_inlier_metric(self, set_='train') -> None: + # """ + # Compute inlier metric from backwards distance distributions. 
+ # This metric defines how well features from a timepoint fit + # into previous timepoints. + # """ - def normalise(dataframe: DataFrame, key: str) -> DataFrame: - if set_ == 'train': - min_value = dataframe.min() - max_value = dataframe.max() - self.data[f'{key}_min'] = min_value - self.data[f'{key}_max'] = max_value - else: - min_value = self.data[f'{key}_min'] - max_value = self.data[f'{key}_max'] - return (dataframe - min_value) / (max_value - min_value) + # def normalise(dataframe: DataFrame, key: str) -> DataFrame: + # if set_ == 'train': + # min_value = dataframe.min() + # max_value = dataframe.max() + # self.data[f'{key}_min'] = min_value + # self.data[f'{key}_max'] = max_value + # else: + # min_value = self.data[f'{key}_min'] + # max_value = self.data[f'{key}_max'] + # return (dataframe - min_value) / (max_value - min_value) - no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] + # no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] - if set_ == 'train': - compute_df = copy.deepcopy(self.data_dictionary['train_features']) - elif set_ == 'test': - compute_df = copy.deepcopy(self.data_dictionary['test_features']) - else: - compute_df = copy.deepcopy(self.data_dictionary['prediction_features']) + # if set_ == 'train': + # compute_df = copy.deepcopy(self.data_dictionary['train_features']) + # elif set_ == 'test': + # compute_df = copy.deepcopy(self.data_dictionary['test_features']) + # else: + # compute_df = copy.deepcopy(self.data_dictionary['prediction_features']) - compute_df_reindexed = compute_df.reindex( - index=np.flip(compute_df.index) - ) + # compute_df_reindexed = compute_df.reindex( + # index=np.flip(compute_df.index) + # ) - pairwise = pd.DataFrame( - np.triu( - pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count) - ), - columns=compute_df_reindexed.index, - index=compute_df_reindexed.index - ) - pairwise = pairwise.round(5) + # pairwise = pd.DataFrame( + # np.triu( + # 
pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count) + # ), + # columns=compute_df_reindexed.index, + # index=compute_df_reindexed.index + # ) + # pairwise = pairwise.round(5) - column_labels = [ - '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1) - ] - distances = pd.DataFrame( - columns=column_labels, index=compute_df.index - ) + # column_labels = [ + # '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1) + # ] + # distances = pd.DataFrame( + # columns=column_labels, index=compute_df.index + # ) - for index in compute_df.index[no_prev_pts:]: - current_row = pairwise.loc[[index]] - current_row_no_zeros = current_row.loc[ - :, (current_row != 0).any(axis=0) - ] - distances.loc[[index]] = current_row_no_zeros.iloc[ - :, :no_prev_pts - ] - distances = distances.replace([np.inf, -np.inf], np.nan) - drop_index = pd.isnull(distances).any(axis=1) - distances = distances[drop_index == 0] + # for index in compute_df.index[no_prev_pts:]: + # current_row = pairwise.loc[[index]] + # current_row_no_zeros = current_row.loc[ + # :, (current_row != 0).any(axis=0) + # ] + # distances.loc[[index]] = current_row_no_zeros.iloc[ + # :, :no_prev_pts + # ] + # distances = distances.replace([np.inf, -np.inf], np.nan) + # drop_index = pd.isnull(distances).any(axis=1) + # distances = distances[drop_index == 0] - inliers = pd.DataFrame(index=distances.index) - for key in distances.keys(): - current_distances = distances[key].dropna() - current_distances = normalise(current_distances, key) - if set_ == 'train': - fit_params = stats.weibull_min.fit(current_distances) - self.data[f'{key}_fit_params'] = fit_params - else: - fit_params = self.data[f'{key}_fit_params'] - quantiles = stats.weibull_min.cdf(current_distances, *fit_params) + # inliers = pd.DataFrame(index=distances.index) + # for key in distances.keys(): + # current_distances = distances[key].dropna() + # current_distances = normalise(current_distances, key) + # if set_ == 'train': + # fit_params = 
stats.weibull_min.fit(current_distances) + # self.data[f'{key}_fit_params'] = fit_params + # else: + # fit_params = self.data[f'{key}_fit_params'] + # quantiles = stats.weibull_min.cdf(current_distances, *fit_params) - df_inlier = pd.DataFrame( - {key: quantiles}, index=distances.index - ) - inliers = pd.concat( - [inliers, df_inlier], axis=1 - ) + # df_inlier = pd.DataFrame( + # {key: quantiles}, index=distances.index + # ) + # inliers = pd.concat( + # [inliers, df_inlier], axis=1 + # ) - inlier_metric = pd.DataFrame( - data=inliers.sum(axis=1) / no_prev_pts, - columns=['%-inlier_metric'], - index=compute_df.index - ) + # inlier_metric = pd.DataFrame( + # data=inliers.sum(axis=1) / no_prev_pts, + # columns=['%-inlier_metric'], + # index=compute_df.index + # ) - inlier_metric = (2 * (inlier_metric - inlier_metric.min()) / - (inlier_metric.max() - inlier_metric.min()) - 1) + # inlier_metric = (2 * (inlier_metric - inlier_metric.min()) / + # (inlier_metric.max() - inlier_metric.min()) - 1) - if set_ in ('train', 'test'): - inlier_metric = inlier_metric.iloc[no_prev_pts:] - compute_df = compute_df.iloc[no_prev_pts:] - self.remove_beginning_points_from_data_dict(set_, no_prev_pts) - self.data_dictionary[f'{set_}_features'] = pd.concat( - [compute_df, inlier_metric], axis=1) - else: - self.data_dictionary['prediction_features'] = pd.concat( - [compute_df, inlier_metric], axis=1) - self.data_dictionary['prediction_features'].fillna(0, inplace=True) + # if set_ in ('train', 'test'): + # inlier_metric = inlier_metric.iloc[no_prev_pts:] + # compute_df = compute_df.iloc[no_prev_pts:] + # self.remove_beginning_points_from_data_dict(set_, no_prev_pts) + # self.data_dictionary[f'{set_}_features'] = pd.concat( + # [compute_df, inlier_metric], axis=1) + # else: + # self.data_dictionary['prediction_features'] = pd.concat( + # [compute_df, inlier_metric], axis=1) + # self.data_dictionary['prediction_features'].fillna(0, inplace=True) - logger.info('Inlier metric computed and added 
to features.') + # logger.info('Inlier metric computed and added to features.') - return None + # return None - def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10): - features = self.data_dictionary[f'{set_}_features'] - weights = self.data_dictionary[f'{set_}_weights'] - labels = self.data_dictionary[f'{set_}_labels'] - self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:] - self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] - self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] + # def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10): + # features = self.data_dictionary[f'{set_}_features'] + # weights = self.data_dictionary[f'{set_}_weights'] + # labels = self.data_dictionary[f'{set_}_labels'] + # self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:] + # self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] + # self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] def add_noise_to_training_features(self) -> None: """ diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 9cfda05ee..cacbfea67 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -23,6 +23,8 @@ from freqtrade.freqai.data_drawer import FreqaiDataDrawer from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.utils import get_tb_logger, plot_feature_importance, record_params from freqtrade.strategy.interface import IStrategy +from datasieve.pipeline import Pipeline +import datasieve.transforms as ds pd.options.mode.chained_assignment = None @@ -566,6 +568,32 @@ class IFreqaiModel(ABC): if ft_params.get("use_DBSCAN_to_remove_outliers", False): dk.use_DBSCAN_to_remove_outliers(predict=True) + def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: + ft_params = self.freqai_info["feature_parameters"] + dk.pipeline = Pipeline([('scaler', 
ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]) + + if ft_params.get("principal_component_analysis", False): + dk.pipeline.steps += [('pca', ds.DataSievePCA())] + dk.pipeline.steps += [('post-pca-scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))] + + if ft_params.get("use_SVM_to_remove_outliers", False): + dk.pipeline.steps += [('svm', ds.SVMOutlierExtractor())] + + if ft_params.get("DI_threshold", 0): + dk.pipeline.steps += [('di', ds.DissimilarityIndex())] + + if ft_params.get("use_DBSCAN_to_remove_outliers", False): + dk.pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())] + + dk.pipeline.fitparams = dk.pipeline._validate_fitparams({}, dk.pipeline.steps) + + # if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): + # dk.pipeline.extend(('noise', ds.Noise())) + + def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None: + + dk.label_pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]) + def model_exists(self, dk: FreqaiDataKitchen) -> bool: """ Given a pair and path, check if a model already exists diff --git a/requirements-freqai.txt b/requirements-freqai.txt index ad069ade2..66da4e873 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,3 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 +datasieve==0.0.5 diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index 13dc6b4b0..e3ef1612c 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -9,9 +9,9 @@ from freqtrade.configuration import TimeRange from freqtrade.data.dataprovider import DataProvider from freqtrade.exceptions import OperationalException from freqtrade.freqai.data_kitchen import FreqaiDataKitchen -from tests.conftest import get_patched_exchange, log_has_re +from tests.conftest import get_patched_exchange # , log_has_re from 
tests.freqai.conftest import (get_patched_data_kitchen, get_patched_freqai_strategy, - make_data_dictionary, make_unfiltered_dataframe) + make_unfiltered_dataframe) # make_data_dictionary, from tests.freqai.test_freqai_interface import is_mac @@ -72,66 +72,66 @@ def test_check_if_model_expired(mocker, freqai_conf): shutil.rmtree(Path(dk.full_path)) -def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog): - freqai = make_data_dictionary(mocker, freqai_conf) - # freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1}) - freqai.dk.use_DBSCAN_to_remove_outliers(predict=False) - assert log_has_re(r"DBSCAN found eps of 1\.7\d\.", caplog) +# def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog): +# freqai = make_data_dictionary(mocker, freqai_conf) +# # freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1}) +# freqai.dk.use_DBSCAN_to_remove_outliers(predict=False) +# assert log_has_re(r"DBSCAN found eps of 1\.7\d\.", caplog) -def test_compute_distances(mocker, freqai_conf): - freqai = make_data_dictionary(mocker, freqai_conf) - freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1}) - avg_mean_dist = freqai.dk.compute_distances() - assert round(avg_mean_dist, 2) == 1.98 +# def test_compute_distances(mocker, freqai_conf): +# freqai = make_data_dictionary(mocker, freqai_conf) +# freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1}) +# avg_mean_dist = freqai.dk.compute_distances() +# assert round(avg_mean_dist, 2) == 1.98 -def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, caplog): - freqai = make_data_dictionary(mocker, freqai_conf) - freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1}) - freqai.dk.use_SVM_to_remove_outliers(predict=False) - assert log_has_re( - "SVM detected 7.83%", - caplog, - ) +# def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, 
caplog): +# freqai = make_data_dictionary(mocker, freqai_conf) +# freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1}) +# freqai.dk.use_SVM_to_remove_outliers(predict=False) +# assert log_has_re( +# "SVM detected 7.83%", +# caplog, +# ) -def test_compute_inlier_metric(mocker, freqai_conf, caplog): - freqai = make_data_dictionary(mocker, freqai_conf) - freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10}) - freqai.dk.compute_inlier_metric(set_='train') - assert log_has_re( - "Inlier metric computed and added to features.", - caplog, - ) +# def test_compute_inlier_metric(mocker, freqai_conf, caplog): +# freqai = make_data_dictionary(mocker, freqai_conf) +# freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10}) +# freqai.dk.compute_inlier_metric(set_='train') +# assert log_has_re( +# "Inlier metric computed and added to features.", +# caplog, +# ) -def test_add_noise_to_training_features(mocker, freqai_conf): - freqai = make_data_dictionary(mocker, freqai_conf) - freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1}) - freqai.dk.add_noise_to_training_features() +# def test_add_noise_to_training_features(mocker, freqai_conf): +# freqai = make_data_dictionary(mocker, freqai_conf) +# freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1}) +# freqai.dk.add_noise_to_training_features() -def test_remove_beginning_points_from_data_dict(mocker, freqai_conf): - freqai = make_data_dictionary(mocker, freqai_conf) - freqai.dk.remove_beginning_points_from_data_dict(set_='train') +# def test_remove_beginning_points_from_data_dict(mocker, freqai_conf): +# freqai = make_data_dictionary(mocker, freqai_conf) +# freqai.dk.remove_beginning_points_from_data_dict(set_='train') -def test_principal_component_analysis(mocker, freqai_conf, caplog): - freqai = make_data_dictionary(mocker, freqai_conf) - freqai.dk.principal_component_analysis() - 
assert log_has_re( - "reduced feature dimension by", - caplog, - ) +# def test_principal_component_analysis(mocker, freqai_conf, caplog): +# freqai = make_data_dictionary(mocker, freqai_conf) +# freqai.dk.principal_component_analysis() +# assert log_has_re( +# "reduced feature dimension by", +# caplog, +# ) -def test_normalize_data(mocker, freqai_conf): - freqai = make_data_dictionary(mocker, freqai_conf) - data_dict = freqai.dk.data_dictionary - freqai.dk.normalize_data(data_dict) - assert any('_max' in entry for entry in freqai.dk.data.keys()) - assert any('_min' in entry for entry in freqai.dk.data.keys()) +# def test_normalize_data(mocker, freqai_conf): +# freqai = make_data_dictionary(mocker, freqai_conf) +# data_dict = freqai.dk.data_dictionary +# freqai.dk.normalize_data(data_dict) +# assert any('_max' in entry for entry in freqai.dk.data.keys()) +# assert any('_min' in entry for entry in freqai.dk.data.keys()) def test_filter_features(mocker, freqai_conf): From e57265361606dae1f34e957f675b333b73587a75 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Mon, 29 May 2023 13:33:29 +0200 Subject: [PATCH 02/32] bring classifier/rl up to new paradigm. ensure tests pass. remove old code. 
add documentation, add new example transform --- docs/freqai-feature-engineering.md | 84 +++- .../RL/BaseReinforcementLearningModel.py | 45 +- .../freqai/base_models/BaseClassifierModel.py | 34 +- .../base_models/BasePyTorchClassifier.py | 72 ++- .../freqai/base_models/BasePyTorchModel.py | 54 +-- .../base_models/BasePyTorchRegressor.py | 74 ++- .../freqai/base_models/BaseRegressionModel.py | 29 +- freqtrade/freqai/data_drawer.py | 29 +- freqtrade/freqai/data_kitchen.py | 441 +----------------- freqtrade/freqai/freqai_interface.py | 93 +--- .../PyTorchTransformerRegressor.py | 16 +- .../prediction_models/XGBoostRFRegressor.py | 4 + .../prediction_models/XGBoostRegressor.py | 23 + freqtrade/freqai/transforms/__init__.py | 6 + .../freqai/transforms/quantile_transform.py | 28 ++ freqtrade/resolvers/freqaimodel_resolver.py | 2 +- tests/freqai/test_freqai_datakitchen.py | 66 +-- tests/freqai/test_freqai_interface.py | 44 +- 18 files changed, 390 insertions(+), 754 deletions(-) create mode 100644 freqtrade/freqai/transforms/__init__.py create mode 100644 freqtrade/freqai/transforms/quantile_transform.py diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index 82b7569a5..eb4b4272e 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -209,15 +209,67 @@ Another example, where the user wants to use live metrics from the trade databas You need to set the standard dictionary in the config so that FreqAI can return proper dataframe shapes. These values will likely be overridden by the prediction model, but in the case where the model has yet to set them, or needs a default initial value, the pre-set values are what will be returned. -## Feature normalization +### Weighting features for temporal importance -FreqAI is strict when it comes to data normalization. 
The train features, $X^{train}$, are always normalized to [-1, 1] using a shifted min-max normalization:
+FreqAI allows you to set a `weight_factor` to weight recent data more strongly than past data via an exponential function:
 
-$$X^{train}_{norm} = 2 * \frac{X^{train} - X^{train}.min()}{X^{train}.max() - X^{train}.min()} - 1$$
+$$ W_i = \exp(\frac{-i}{\alpha*n}) $$
 
-All other data (test data and unseen prediction data in dry/live/backtest) is always automatically normalized to the training feature space according to industry standards. FreqAI stores all the metadata required to ensure that test and prediction features will be properly normalized and that predictions are properly denormalized. For this reason, it is not recommended to eschew industry standards and modify FreqAI internals - however - advanced users can do so by inheriting `train()` in their custom `IFreqaiModel` and using their own normalization functions.
+where $W_i$ is the weight of data point $i$ in a total set of $n$ data points. Below is a figure showing the effect of different weight factors on the data points in a feature set.
 
-## Data dimensionality reduction with Principal Component Analysis
+![weight-factor](assets/freqai_weight-factor.jpg)
+
+## Building the data pipeline
+
+FreqAI uses the [`DataSieve`](https://github.com/emergentmethods/datasieve) pipeline, which follows the SKlearn pipeline API, but adds, among other features, coherence between the X, y, and sample_weight vector point removals, and feature removal feature name following.
+
+This means that users can use/customize any SKLearn modules and easily add them to their FreqAI data pipeline. 
By default, FreqAI builds the following pipeline:
+
+```py
+dk.feature_pipeline = Pipeline([
+    ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))),
+    ('di', ds.DissimilarityIndex(di_threshold=1)),
+    ])
+```
+
+But users will find that they can add PCA and other steps just by changing their configuration settings, for example, if you add `"principal_component_analysis": true` to the `feature_parameters` dict in the `freqai` config, then FreqAI will add the PCA step for you resulting in the following pipeline:
+
+```py
+dk.feature_pipeline = Pipeline([
+    ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))),
+    ('pca', ds.DataSievePCA()),
+    ('post-pca-scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))),
+    ('di', ds.DissimilarityIndex(di_threshold=1)),
+    ])
+```
+
+The same concept follows if users activate other config options like `"use_SVM_to_remove_outliers": true` or `"use_DBSCAN_to_remove_outliers": true`. FreqAI will add the appropriate steps to the pipeline for you.
+
+## Customizing the pipeline
+
+Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. This can be done by overriding `define_data_pipeline` in their `IFreqaiModel`. For example:
+
+```py
+    def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None:
+        """
+        User defines their custom feature pipeline here (if they wish)
+        """
+        from freqtrade.freqai.transforms import FreqaiQuantileTransformer
+        dk.feature_pipeline = Pipeline([
+            ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+        ])
+
+        return
+```
+
+Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. If you have a custom step that you would like to add to the pipeline, you simply create a class that follows the DataSieve/SKLearn API. That means your step must have a `fit()`, `transform()`, `fit_transform()`, and `inverse_transform()` method. 
You can see examples of this in the `freqtrade.freqai.transforms` module where we use SKLearn `QuantileNormalization` to create a new step for the pipeline.
+
+As there is the `feature_pipeline`, there also exists a definition for the `label_pipeline` which can be defined the same way as the `feature_pipeline`, by overriding `define_label_pipeline`.
+
+!!! note "Inheritance required"
+    While most SKLearn methods are very easy to override, as shown in freqtrade/freqai/transforms/quantile_transform.py, they still need to include passing X, y, and sample_weights through all `fit()`, `transform()`, `fit_transform()` and `inverse_transform()` functions, even if that means a direct pass through without modifications.
+
+
 ## Outlier detection
 
@@ -259,7 +301,7 @@ Equity and crypto markets suffer from a high level of non-patterned noise in the
 
 ### Identifying outliers with the Dissimilarity Index (DI)
 
- The Dissimilarity Index (DI) aims to quantify the uncertainty associated with each prediction made by the model.
+The Dissimilarity Index (DI) aims to quantify the uncertainty associated with each prediction made by the model.
 
 You can tell FreqAI to remove outlier data points from the training/test data sets using the DI by including the following statement in the config:
 
@@ -271,7 +313,7 @@ You can tell FreqAI to remove outlier data points from the training/test data se
     }
 ```
 
- The DI allows predictions which are outliers (not existent in the model feature space) to be thrown out due to low levels of certainty. To do so, FreqAI measures the distance between each training data point (feature vector), $X_{a}$, and all other training data points:
+Which will add the `DissimilarityIndex` step to your `feature_pipeline` and set the threshold to 1. The DI allows predictions which are outliers (not existent in the model feature space) to be thrown out due to low levels of certainty. 
To do so, FreqAI measures the distance between each training data point (feature vector), $X_{a}$, and all other training data points: $$ d_{ab} = \sqrt{\sum_{j=1}^p(X_{a,j}-X_{b,j})^2} $$ @@ -305,9 +347,9 @@ You can tell FreqAI to remove outlier data points from the training/test data se } ``` -The SVM will be trained on the training data and any data point that the SVM deems to be beyond the feature space will be removed. +Which will add `SVMOutlierExtractor` step to your `feature_pipeline`. The SVM will be trained on the training data and any data point that the SVM deems to be beyond the feature space will be removed. -FreqAI uses `sklearn.linear_model.SGDOneClassSVM` (details are available on scikit-learn's webpage [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDOneClassSVM.html) (external website)) and you can elect to provide additional parameters for the SVM, such as `shuffle`, and `nu`. +You can elect to provide additional parameters for the SVM, such as `shuffle`, and `nu` via the `feature_parameters.svm_params` dictionary in the config. The parameter `shuffle` is by default set to `False` to ensure consistent results. If it is set to `True`, running the SVM multiple times on the same data set might result in different outcomes due to `max_iter` being to low for the algorithm to reach the demanded `tol`. Increasing `max_iter` solves this issue but causes the procedure to take longer time. @@ -325,7 +367,7 @@ You can configure FreqAI to use DBSCAN to cluster and remove outliers from the t } ``` -DBSCAN is an unsupervised machine learning algorithm that clusters data without needing to know how many clusters there should be. +Which will add the `DataSieveDBSCAN` step to your `feature_pipeline`. This is an unsupervised machine learning algorithm that clusters data without needing to know how many clusters there should be. 
Given a number of data points $N$, and a distance $\varepsilon$, DBSCAN clusters the data set by setting all data points that have $N-1$ other data points within a distance of $\varepsilon$ as *core points*. A data point that is within a distance of $\varepsilon$ from a *core point* but that does not have $N-1$ other data points within a distance of $\varepsilon$ from itself is considered an *edge point*. A cluster is then the collection of *core points* and *edge points*. Data points that have no other data points at a distance $<\varepsilon$ are considered outliers. The figure below shows a cluster with $N = 3$. diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index 8ee3c7c56..bd22decaa 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -82,6 +82,9 @@ class BaseReinforcementLearningModel(IFreqaiModel): if self.ft_params.get('use_DBSCAN_to_remove_outliers', False): self.ft_params.update({'use_DBSCAN_to_remove_outliers': False}) logger.warning('User tried to use DBSCAN with RL. Deactivating DBSCAN.') + if self.ft_params.get('DI_threshold', False): + self.ft_params.update({'DI_threshold': False}) + logger.warning('User tried to use DI_threshold with RL. Deactivating DI_threshold.') if self.freqai_info['data_split_parameters'].get('shuffle', False): self.freqai_info['data_split_parameters'].update({'shuffle': False}) logger.warning('User tried to shuffle training data. 
Setting shuffle to False') @@ -107,27 +110,40 @@ class BaseReinforcementLearningModel(IFreqaiModel): training_filter=True, ) - data_dictionary: Dict[str, Any] = dk.make_train_test_datasets( + d: Dict[str, Any] = dk.make_train_test_datasets( features_filtered, labels_filtered) - self.df_raw = copy.deepcopy(data_dictionary["train_features"]) + self.df_raw = copy.deepcopy(d["train_features"]) dk.fit_labels() # FIXME useless for now, but just satiating append methods # normalize all data based on train_dataset only prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk) - data_dictionary = dk.normalize_data(data_dictionary) + self.define_data_pipeline(dk) + self.define_label_pipeline(dk) - # data cleaning/analysis - self.data_cleaning_train(dk) + # d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) + # d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + + (d["train_features"], + d["train_labels"], + d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], + d["train_labels"], + d["train_weights"]) + + (d["test_features"], + d["test_labels"], + d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], + d["test_labels"], + d["test_weights"]) logger.info( f'Training model on {len(dk.data_dictionary["train_features"].columns)}' - f' features and {len(data_dictionary["train_features"])} data points' + f' features and {len(d["train_features"])} data points' ) - self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) + self.set_train_and_eval_environments(d, prices_train, prices_test, dk) - model = self.fit(data_dictionary, dk) + model = self.fit(d, dk) logger.info(f"--------------------done training {pair}--------------------") @@ -236,18 +252,19 @@ class BaseReinforcementLearningModel(IFreqaiModel): unfiltered_df, dk.training_features_list, training_filter=False ) - filtered_dataframe = self.drop_ohlc_from_df(filtered_dataframe, 
dk) + dk.data_dictionary["prediction_features"] = self.drop_ohlc_from_df(filtered_dataframe, dk) - filtered_dataframe = dk.normalize_data_from_metadata(filtered_dataframe) - dk.data_dictionary["prediction_features"] = filtered_dataframe - - # optional additional data cleaning/analysis - self.data_cleaning_predict(dk) + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) pred_df = self.rl_model_predict( dk.data_dictionary["prediction_features"], dk, self.model) pred_df.fillna(0, inplace=True) + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + dk.do_predict = outliers.to_numpy() + return (pred_df, dk.do_predict) def rl_model_predict(self, dataframe: DataFrame, diff --git a/freqtrade/freqai/base_models/BaseClassifierModel.py b/freqtrade/freqai/base_models/BaseClassifierModel.py index ffd42dd1d..179e8a5af 100644 --- a/freqtrade/freqai/base_models/BaseClassifierModel.py +++ b/freqtrade/freqai/base_models/BaseClassifierModel.py @@ -50,21 +50,30 @@ class BaseClassifierModel(IFreqaiModel): logger.info(f"-------------------- Training on data from {start_date} to " f"{end_date} --------------------") # split data into train/test data. 
- data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) + d = dk.make_train_test_datasets(features_filtered, labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - # normalize all data based on train_dataset only - data_dictionary = dk.normalize_data(data_dictionary) + self.define_data_pipeline(dk) + self.define_label_pipeline(dk) - # optional additional data cleaning/analysis - self.data_cleaning_train(dk) + (d["train_features"], + d["train_labels"], + d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], + d["train_labels"], + d["train_weights"]) + + (d["test_features"], + d["test_labels"], + d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], + d["test_labels"], + d["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" ) - logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") + logger.info(f"Training model on {len(d['train_features'])} data points") - model = self.fit(data_dictionary, dk) + model = self.fit(d, dk) end_time = time() @@ -89,10 +98,11 @@ class BaseClassifierModel(IFreqaiModel): filtered_df, _ = dk.filter_features( unfiltered_df, dk.training_features_list, training_filter=False ) - filtered_df = dk.normalize_data_from_metadata(filtered_df) + dk.data_dictionary["prediction_features"] = filtered_df - self.data_cleaning_predict(dk) + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) predictions = self.model.predict(dk.data_dictionary["prediction_features"]) if self.CONV_WIDTH == 1: @@ -107,4 +117,10 @@ class BaseClassifierModel(IFreqaiModel): pred_df = pd.concat([pred_df, pred_df_prob], axis=1) + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = 
np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() + return (pred_df, dk.do_predict) diff --git a/freqtrade/freqai/base_models/BasePyTorchClassifier.py b/freqtrade/freqai/base_models/BasePyTorchClassifier.py index 436294dcc..448384852 100644 --- a/freqtrade/freqai/base_models/BasePyTorchClassifier.py +++ b/freqtrade/freqai/base_models/BasePyTorchClassifier.py @@ -1,5 +1,6 @@ import logging -from typing import Dict, List, Tuple +from time import time +from typing import Any, Dict, List, Tuple import numpy as np import numpy.typing as npt @@ -68,9 +69,12 @@ class BasePyTorchClassifier(BasePyTorchModel): filtered_df, _ = dk.filter_features( unfiltered_df, dk.training_features_list, training_filter=False ) - filtered_df = dk.normalize_data_from_metadata(filtered_df) + dk.data_dictionary["prediction_features"] = filtered_df - self.data_cleaning_predict(dk) + + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) + x = self.data_convertor.convert_x( dk.data_dictionary["prediction_features"], device=self.device @@ -85,6 +89,13 @@ class BasePyTorchClassifier(BasePyTorchModel): pred_df_prob = DataFrame(probs.detach().tolist(), columns=class_names) pred_df = DataFrame(predicted_classes_str, columns=[dk.label_list[0]]) pred_df = pd.concat([pred_df, pred_df_prob], axis=1) + + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() + return (pred_df, dk.do_predict) def encode_class_names( @@ -149,3 +160,58 @@ class BasePyTorchClassifier(BasePyTorchModel): ) return self.class_names + + def train( + self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs + ) -> Any: + """ + Filter the training data and train a model to it. 
Train makes heavy use of the datakitchen + for storing, saving, loading, and analyzing the data. + :param unfiltered_df: Full dataframe for the current training period + :return: + :model: Trained model which can be used to inference (self.predict) + """ + + logger.info(f"-------------------- Starting training {pair} --------------------") + + start_time = time() + + features_filtered, labels_filtered = dk.filter_features( + unfiltered_df, + dk.training_features_list, + dk.label_list, + training_filter=True, + ) + + # split data into train/test data. + d = dk.make_train_test_datasets(features_filtered, labels_filtered) + if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: + dk.fit_labels() + + d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) + d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + + (d["train_features"], + d["train_labels"], + d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], + d["train_labels"], + d["train_weights"]) + + (d["test_features"], + d["test_labels"], + d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], + d["test_labels"], + d["test_weights"]) + + logger.info( + f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" + ) + logger.info(f"Training model on {len(d['train_features'])} data points") + + model = self.fit(d, dk) + end_time = time() + + logger.info(f"-------------------- Done training {pair} " + f"({end_time - start_time:.2f} secs) --------------------") + + return model diff --git a/freqtrade/freqai/base_models/BasePyTorchModel.py b/freqtrade/freqai/base_models/BasePyTorchModel.py index 21dc4e894..71369a146 100644 --- a/freqtrade/freqai/base_models/BasePyTorchModel.py +++ b/freqtrade/freqai/base_models/BasePyTorchModel.py @@ -1,21 +1,16 @@ import logging from abc import ABC, abstractmethod -from time import time -from typing import Any import torch -from pandas import DataFrame -from 
freqtrade.freqai.data_kitchen import FreqaiDataKitchen -# from freqtrade.freqai.freqai_interface import IFreqaiModel -from freqtrade.freqai.base_models import BaseRegressionModel +from freqtrade.freqai.freqai_interface import IFreqaiModel from freqtrade.freqai.torch.PyTorchDataConvertor import PyTorchDataConvertor logger = logging.getLogger(__name__) -class BasePyTorchModel(BaseRegressionModel): +class BasePyTorchModel(IFreqaiModel, ABC): """ Base class for PyTorch type models. User *must* inherit from this class and set fit() and predict() and @@ -30,51 +25,6 @@ class BasePyTorchModel(BaseRegressionModel): self.splits = ["train", "test"] if test_size != 0 else ["train"] self.window_size = self.freqai_info.get("conv_width", 1) - # def train( - # self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs - # ) -> Any: - # """ - # Filter the training data and train a model to it. Train makes heavy use of the datakitchen - # for storing, saving, loading, and analyzing the data. - # :param unfiltered_df: Full dataframe for the current training period - # :return: - # :model: Trained model which can be used to inference (self.predict) - # """ - - # logger.info(f"-------------------- Starting training {pair} --------------------") - - # start_time = time() - - # features_filtered, labels_filtered = dk.filter_features( - # unfiltered_df, - # dk.training_features_list, - # dk.label_list, - # training_filter=True, - # ) - - # # split data into train/test data. 
- # data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) - # if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: - # dk.fit_labels() - # # normalize all data based on train_dataset only - # data_dictionary = dk.normalize_data(data_dictionary) - - # # optional additional data cleaning/analysis - # self.data_cleaning_train(dk) - - # logger.info( - # f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" - # ) - # logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") - - # model = self.fit(data_dictionary, dk) - # end_time = time() - - # logger.info(f"-------------------- Done training {pair} " - # f"({end_time - start_time:.2f} secs) --------------------") - - # return model - @property @abstractmethod def data_convertor(self) -> PyTorchDataConvertor: diff --git a/freqtrade/freqai/base_models/BasePyTorchRegressor.py b/freqtrade/freqai/base_models/BasePyTorchRegressor.py index 6139f2e85..2f2aaef39 100644 --- a/freqtrade/freqai/base_models/BasePyTorchRegressor.py +++ b/freqtrade/freqai/base_models/BasePyTorchRegressor.py @@ -1,5 +1,6 @@ import logging -from typing import Tuple +from time import time +from typing import Any, Tuple import numpy as np import numpy.typing as npt @@ -36,10 +37,11 @@ class BasePyTorchRegressor(BasePyTorchModel): filtered_df, _ = dk.filter_features( unfiltered_df, dk.training_features_list, training_filter=False ) - filtered_df = dk.normalize_data_from_metadata(filtered_df) dk.data_dictionary["prediction_features"] = filtered_df - self.data_cleaning_predict(dk) + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) + x = self.data_convertor.convert_x( dk.data_dictionary["prediction_features"], device=self.device @@ -47,5 +49,69 @@ class BasePyTorchRegressor(BasePyTorchModel): self.model.model.eval() y = self.model.model(x) pred_df = 
DataFrame(y.detach().tolist(), columns=[dk.label_list[0]]) - pred_df = dk.denormalize_labels_from_metadata(pred_df) + pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) + + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() return (pred_df, dk.do_predict) + + def train( + self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs + ) -> Any: + """ + Filter the training data and train a model to it. Train makes heavy use of the datakitchen + for storing, saving, loading, and analyzing the data. + :param unfiltered_df: Full dataframe for the current training period + :return: + :model: Trained model which can be used to inference (self.predict) + """ + + logger.info(f"-------------------- Starting training {pair} --------------------") + + start_time = time() + + features_filtered, labels_filtered = dk.filter_features( + unfiltered_df, + dk.training_features_list, + dk.label_list, + training_filter=True, + ) + + # split data into train/test data. 
+ d = dk.make_train_test_datasets(features_filtered, labels_filtered) + if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: + dk.fit_labels() + + self.define_data_pipeline(dk) + self.define_label_pipeline(dk) + + d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) + d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + + (d["train_features"], + d["train_labels"], + d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], + d["train_labels"], + d["train_weights"]) + + (d["test_features"], + d["test_labels"], + d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], + d["test_labels"], + d["test_weights"]) + + logger.info( + f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" + ) + logger.info(f"Training model on {len(d['train_features'])} data points") + + model = self.fit(d, dk) + end_time = time() + + logger.info(f"-------------------- Done training {pair} " + f"({end_time - start_time:.2f} secs) --------------------") + + return model diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index 45660253e..1babd5f0c 100644 --- a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -56,20 +56,20 @@ class BaseRegressionModel(IFreqaiModel): self.define_data_pipeline(dk) self.define_label_pipeline(dk) - d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) - d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) - (d["train_features"], d["train_labels"], - d["train_weights"]) = dk.pipeline.fit_transform(d["train_features"], - d["train_labels"], - d["train_weights"]) + d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], + d["train_labels"], + d["train_weights"]) (d["test_features"], d["test_labels"], - d["test_weights"]) = 
dk.pipeline.transform(d["test_features"], - d["test_labels"], - d["test_weights"]) + d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], + d["test_labels"], + d["test_weights"]) + + d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) + d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" @@ -98,13 +98,11 @@ class BaseRegressionModel(IFreqaiModel): """ dk.find_features(unfiltered_df) - filtered_df, _ = dk.filter_features( + dk.data_dictionary["prediction_features"], _ = dk.filter_features( unfiltered_df, dk.training_features_list, training_filter=False ) - # filtered_df = dk.normalize_data_from_metadata(filtered_df) - dk.data_dictionary["prediction_features"] = filtered_df - dk.data_dictionary["prediction_features"], outliers, _ = dk.pipeline.transform( + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( dk.data_dictionary["prediction_features"], outlier_check=True) predictions = self.model.predict(dk.data_dictionary["prediction_features"]) @@ -114,7 +112,10 @@ class BaseRegressionModel(IFreqaiModel): pred_df = DataFrame(predictions, columns=dk.label_list) pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) - dk.DI_values = dk.label_pipeline.get_step("di").di_values + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) dk.do_predict = outliers.to_numpy() return (pred_df, dk.do_predict) diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 9fdcc2d41..670dfc620 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -449,9 +449,6 @@ class FreqaiDataDrawer: elif self.model_type in ["stable_baselines3", "sb3_contrib", "pytorch"]: model.save(save_path / f"{dk.model_filename}_model.zip") - if dk.svm_model is 
not None: - dump(dk.svm_model, save_path / f"{dk.model_filename}_svm_model.joblib") - dk.data["data_path"] = str(dk.data_path) dk.data["model_filename"] = str(dk.model_filename) dk.data["training_features_list"] = dk.training_features_list @@ -461,8 +458,8 @@ class FreqaiDataDrawer: rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) # save the pipelines to pickle files - with (save_path / f"{dk.model_filename}_pipeline.pkl").open("wb") as fp: - cloudpickle.dump(dk.pipeline, fp) + with (save_path / f"{dk.model_filename}_feature_pipeline.pkl").open("wb") as fp: + cloudpickle.dump(dk.feature_pipeline, fp) with (save_path / f"{dk.model_filename}_label_pipeline.pkl").open("wb") as fp: cloudpickle.dump(dk.label_pipeline, fp) @@ -476,11 +473,6 @@ class FreqaiDataDrawer: save_path / f"{dk.model_filename}_trained_dates_df.pkl" ) - if self.freqai_info["feature_parameters"].get("principal_component_analysis"): - cloudpickle.dump( - dk.pca, (dk.data_path / f"{dk.model_filename}_pca_object.pkl").open("wb") - ) - self.model_dictionary[coin] = model self.pair_dict[coin]["model_filename"] = dk.model_filename self.pair_dict[coin]["data_path"] = str(dk.data_path) @@ -489,7 +481,7 @@ class FreqaiDataDrawer: self.meta_data_dictionary[coin] = {} self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"] self.meta_data_dictionary[coin]["meta_data"] = dk.data - self.meta_data_dictionary[coin]["pipeline"] = dk.pipeline + self.meta_data_dictionary[coin]["feature_pipeline"] = dk.feature_pipeline self.meta_data_dictionary[coin]["label_pipeline"] = dk.label_pipeline self.save_drawer_to_disk() @@ -522,7 +514,7 @@ class FreqaiDataDrawer: if coin in self.meta_data_dictionary: dk.data = self.meta_data_dictionary[coin]["meta_data"] dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"] - dk.pipeline = self.meta_data_dictionary[coin]["pipeline"] + dk.feature_pipeline = 
self.meta_data_dictionary[coin]["feature_pipeline"] dk.label_pipeline = self.meta_data_dictionary[coin]["label_pipeline"] else: with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp: @@ -532,7 +524,7 @@ class FreqaiDataDrawer: dk.data_path / f"{dk.model_filename}_trained_df.pkl" ) with (dk.data_path / f"{dk.model_filename}_pipeline.pkl").open("rb") as fp: - dk.pipeline = cloudpickle.load(fp) + dk.feature_pipeline = cloudpickle.load(fp) with (dk.data_path / f"{dk.model_filename}_label_pipeline.pkl").open("rb") as fp: dk.label_pipeline = cloudpickle.load(fp) @@ -544,9 +536,6 @@ class FreqaiDataDrawer: model = self.model_dictionary[coin] elif self.model_type == 'joblib': model = load(dk.data_path / f"{dk.model_filename}_model.joblib") - elif self.model_type == 'keras': - from tensorflow import keras - model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5") elif 'stable_baselines' in self.model_type or 'sb3_contrib' == self.model_type: mod = importlib.import_module( self.model_type, self.freqai_info['rl_config']['model_type']) @@ -558,9 +547,6 @@ class FreqaiDataDrawer: model = zip["pytrainer"] model = model.load_from_checkpoint(zip) - if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file(): - dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib") - if not model: raise OperationalException( f"Unable to load model, ensure model exists at " f"{dk.data_path} " @@ -570,11 +556,6 @@ class FreqaiDataDrawer: if coin not in self.model_dictionary: self.model_dictionary[coin] = model - if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]: - dk.pca = cloudpickle.load( - (dk.data_path / f"{dk.model_filename}_pca_object.pkl").open("rb") - ) - return model def update_historic_data(self, strategy: IStrategy, dk: FreqaiDataKitchen) -> None: diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index adfeb8dd5..04182dc69 100644 --- 
a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -4,7 +4,6 @@ import logging import random import shutil from datetime import datetime, timezone -from math import cos, sin from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -12,13 +11,10 @@ import numpy as np import numpy.typing as npt import pandas as pd import psutil +from datasieve.pipeline import Pipeline from pandas import DataFrame -from scipy import stats -from sklearn import linear_model -from sklearn.cluster import DBSCAN from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import train_test_split -from sklearn.neighbors import NearestNeighbors from freqtrade.configuration import TimeRange from freqtrade.constants import Config @@ -27,7 +23,6 @@ from freqtrade.exceptions import OperationalException from freqtrade.exchange import timeframe_to_seconds from freqtrade.strategy import merge_informative_pair from freqtrade.strategy.interface import IStrategy -from datasieve.pipeline import Pipeline SECONDS_IN_DAY = 86400 @@ -83,11 +78,11 @@ class FreqaiDataKitchen: self.live = live self.pair = pair - self.svm_model: linear_model.SGDOneClassSVM = None + # self.svm_model: linear_model.SGDOneClassSVM = None self.keras: bool = self.freqai_config.get("keras", False) self.set_all_pairs() self.backtest_live_models = config.get("freqai_backtest_live_models", False) - self.pipeline = Pipeline() + self.feature_pipeline = Pipeline() self.label_pipeline = Pipeline() if not self.live: @@ -230,13 +225,14 @@ class FreqaiDataKitchen: drop_index = pd.isnull(filtered_df).any(axis=1) # get the rows that have NaNs, drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement. 
if (training_filter): - const_cols = list((filtered_df.nunique() == 1).loc[lambda x: x].index) - if const_cols: - filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols)) - self.data['constant_features_list'] = const_cols - logger.warning(f"Removed features {const_cols} with constant values.") - else: - self.data['constant_features_list'] = [] + # const_cols = list((filtered_df.nunique() == 1).loc[lambda x: x].index) + # if const_cols: + # filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols)) + # self.data['constant_features_list'] = const_cols + # logger.warning(f"Removed features {const_cols} with constant values.") + # else: + # self.data['constant_features_list'] = [] + # we don't care about total row number (total no. datapoints) in training, we only care # about removing any row with NaNs # if labels has multiple columns (user wants to train multiple modelEs), we detect here @@ -267,8 +263,10 @@ class FreqaiDataKitchen: self.data["filter_drop_index_training"] = drop_index else: - if 'constant_features_list' in self.data and len(self.data['constant_features_list']): - filtered_df = self.check_pred_labels(filtered_df) + + # if 'constant_features_list' in self.data and len(self.data['constant_features_list']): + # filtered_df = self.check_pred_labels(filtered_df) + # we are backtesting so we need to preserve row number to send back to strategy, # so now we use do_predict to avoid any prediction based on a NaN drop_index = pd.isnull(filtered_df).any(axis=1) @@ -488,415 +486,6 @@ class FreqaiDataKitchen: return df - def check_pred_labels(self, df_predictions: DataFrame) -> DataFrame: - """ - Check that prediction feature labels match training feature labels. 
- :param df_predictions: incoming predictions - """ - constant_labels = self.data['constant_features_list'] - df_predictions = df_predictions.filter( - df_predictions.columns.difference(constant_labels) - ) - logger.warning( - f"Removed {len(constant_labels)} features from prediction features, " - f"these were considered constant values during most recent training." - ) - - return df_predictions - - # def principal_component_analysis(self) -> None: - # """ - # Performs Principal Component Analysis on the data for dimensionality reduction - # and outlier detection (see self.remove_outliers()) - # No parameters or returns, it acts on the data_dictionary held by the DataHandler. - # """ - - # from sklearn.decomposition import PCA # avoid importing if we dont need it - - # pca = PCA(0.999) - # pca = pca.fit(self.data_dictionary["train_features"]) - # n_keep_components = pca.n_components_ - # self.data["n_kept_components"] = n_keep_components - # n_components = self.data_dictionary["train_features"].shape[1] - # logger.info("reduced feature dimension by %s", n_components - n_keep_components) - # logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_)) - - # train_components = pca.transform(self.data_dictionary["train_features"]) - # self.data_dictionary["train_features"] = pd.DataFrame( - # data=train_components, - # columns=["PC" + str(i) for i in range(0, n_keep_components)], - # index=self.data_dictionary["train_features"].index, - # ) - # # normalsing transformed training features - # self.data_dictionary["train_features"] = self.normalize_single_dataframe( - # self.data_dictionary["train_features"]) - - # # keeping a copy of the non-transformed features so we can check for errors during - # # model load from disk - # self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list) - # self.training_features_list = self.data_dictionary["train_features"].columns - - # if self.freqai_config.get('data_split_parameters', 
{}).get('test_size', 0.1) != 0: - # test_components = pca.transform(self.data_dictionary["test_features"]) - # self.data_dictionary["test_features"] = pd.DataFrame( - # data=test_components, - # columns=["PC" + str(i) for i in range(0, n_keep_components)], - # index=self.data_dictionary["test_features"].index, - # ) - # # normalise transformed test feature to transformed training features - # self.data_dictionary["test_features"] = self.normalize_data_from_metadata( - # self.data_dictionary["test_features"]) - - # self.data["n_kept_components"] = n_keep_components - # self.pca = pca - - # logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}") - - # if not self.data_path.is_dir(): - # self.data_path.mkdir(parents=True, exist_ok=True) - - # return None - - # def pca_transform(self, filtered_dataframe: DataFrame) -> None: - # """ - # Use an existing pca transform to transform data into components - # :param filtered_dataframe: DataFrame = the cleaned dataframe - # """ - # pca_components = self.pca.transform(filtered_dataframe) - # self.data_dictionary["prediction_features"] = pd.DataFrame( - # data=pca_components, - # columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])], - # index=filtered_dataframe.index, - # ) - # # normalise transformed predictions to transformed training features - # self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata( - # self.data_dictionary["prediction_features"]) - - # def compute_distances(self) -> float: - # """ - # Compute distances between each training point and every other training - # point. 
This metric defines the neighborhood of trained data and is used - # for prediction confidence in the Dissimilarity Index - # """ - # # logger.info("computing average mean distance for all training points") - # pairwise = pairwise_distances( - # self.data_dictionary["train_features"], n_jobs=self.thread_count) - # # remove the diagonal distances which are itself distances ~0 - # np.fill_diagonal(pairwise, np.NaN) - # pairwise = pairwise.reshape(-1, 1) - # avg_mean_dist = pairwise[~np.isnan(pairwise)].mean() - - # return avg_mean_dist - - # def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float: - # """ - # Check if more than X% of points werer dropped during outlier detection. - # """ - # outlier_protection_pct = self.freqai_config["feature_parameters"].get( - # "outlier_protection_percentage", 30) - # outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100 - # if outlier_pct >= outlier_protection_pct: - # return outlier_pct - # else: - # return 0.0 - - # def use_SVM_to_remove_outliers(self, predict: bool) -> None: - # """ - # Build/inference a Support Vector Machine to detect outliers - # in training data and prediction - # :param predict: bool = If true, inference an existing SVM model, else construct one - # """ - - # if self.keras: - # logger.warning( - # "SVM outlier removal not currently supported for Keras based models. " - # "Skipping user requested function." 
- # ) - # if predict: - # self.do_predict = np.ones(len(self.data_dictionary["prediction_features"])) - # return - - # if predict: - # if not self.svm_model: - # logger.warning("No svm model available for outlier removal") - # return - # y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"]) - # do_predict = np.where(y_pred == -1, 0, y_pred) - - # if (len(do_predict) - do_predict.sum()) > 0: - # logger.info(f"SVM tossed {len(do_predict) - do_predict.sum()} predictions.") - # self.do_predict += do_predict - # self.do_predict -= 1 - - # else: - # # use SGDOneClassSVM to increase speed? - # svm_params = self.freqai_config["feature_parameters"].get( - # "svm_params", {"shuffle": False, "nu": 0.1}) - # self.svm_model = linear_model.SGDOneClassSVM(**svm_params).fit( - # self.data_dictionary["train_features"] - # ) - # y_pred = self.svm_model.predict(self.data_dictionary["train_features"]) - # kept_points = np.where(y_pred == -1, 0, y_pred) - # # keep_index = np.where(y_pred == 1) - # outlier_pct = self.get_outlier_percentage(1 - kept_points) - # if outlier_pct: - # logger.warning( - # f"SVM detected {outlier_pct:.2f}% of the points as outliers. " - # f"Keeping original dataset." - # ) - # self.svm_model = None - # return - - # self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ - # (y_pred == 1) - # ] - # self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ - # (y_pred == 1) - # ] - # self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ - # (y_pred == 1) - # ] - - # logger.info( - # f"SVM tossed {len(y_pred) - kept_points.sum()}" - # f" train points from {len(y_pred)} total points." 
- # ) - - # # same for test data - # # TODO: This (and the part above) could be refactored into a separate function - # # to reduce code duplication - # if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0: - # y_pred = self.svm_model.predict(self.data_dictionary["test_features"]) - # kept_points = np.where(y_pred == -1, 0, y_pred) - # self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ - # (y_pred == 1) - # ] - # self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][( - # y_pred == 1)] - # self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ - # (y_pred == 1) - # ] - - # logger.info( - # f"{self.pair}: SVM tossed {len(y_pred) - kept_points.sum()}" - # f" test points from {len(y_pred)} total points." - # ) - - # return - - # def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None: - # """ - # Use DBSCAN to cluster training data and remove "noisy" data (read outliers). - # User controls this via the config param `DBSCAN_outlier_pct` which indicates the - # pct of training data that they want to be considered outliers. - # :param predict: bool = If False (training), iterate to find the best hyper parameters - # to match user requested outlier percent target. - # If True (prediction), use the parameters determined from - # the previous training to estimate if the current prediction point - # is an outlier. 
- # """ - - # if predict: - # if not self.data['DBSCAN_eps']: - # return - # train_ft_df = self.data_dictionary['train_features'] - # pred_ft_df = self.data_dictionary['prediction_features'] - # num_preds = len(pred_ft_df) - # df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True) - # clustering = DBSCAN(eps=self.data['DBSCAN_eps'], - # min_samples=self.data['DBSCAN_min_samples'], - # n_jobs=self.thread_count - # ).fit(df) - # do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1) - - # if (len(do_predict) - do_predict.sum()) > 0: - # logger.info(f"DBSCAN tossed {len(do_predict) - do_predict.sum()} predictions") - # self.do_predict += do_predict - # self.do_predict -= 1 - - # else: - - # def normalise_distances(distances): - # normalised_distances = (distances - distances.min()) / \ - # (distances.max() - distances.min()) - # return normalised_distances - - # def rotate_point(origin, point, angle): - # # rotate a point counterclockwise by a given angle (in radians) - # # around a given origin - # x = origin[0] + cos(angle) * (point[0] - origin[0]) - \ - # sin(angle) * (point[1] - origin[1]) - # y = origin[1] + sin(angle) * (point[0] - origin[0]) + \ - # cos(angle) * (point[1] - origin[1]) - # return (x, y) - - # MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25) - # # measure pairwise distances to nearest neighbours - # neighbors = NearestNeighbors( - # n_neighbors=MinPts, n_jobs=self.thread_count) - # neighbors_fit = neighbors.fit(self.data_dictionary['train_features']) - # distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features']) - # distances = np.sort(distances, axis=0).mean(axis=1) - - # normalised_distances = normalise_distances(distances) - # x_range = np.linspace(0, 1, len(distances)) - # line = np.linspace(normalised_distances[0], - # normalised_distances[-1], len(normalised_distances)) - # deflection = np.abs(normalised_distances - line) - # max_deflection_loc = np.where(deflection == 
deflection.max())[0][0] - # origin = x_range[max_deflection_loc], line[max_deflection_loc] - # point = x_range[max_deflection_loc], normalised_distances[max_deflection_loc] - # rot_angle = np.pi / 4 - # elbow_loc = rotate_point(origin, point, rot_angle) - - # epsilon = elbow_loc[1] * (distances[-1] - distances[0]) + distances[0] - - # clustering = DBSCAN(eps=epsilon, min_samples=MinPts, - # n_jobs=int(self.thread_count)).fit( - # self.data_dictionary['train_features'] - # ) - - # logger.info(f'DBSCAN found eps of {epsilon:.2f}.') - - # self.data['DBSCAN_eps'] = epsilon - # self.data['DBSCAN_min_samples'] = MinPts - # dropped_points = np.where(clustering.labels_ == -1, 1, 0) - - # outlier_pct = self.get_outlier_percentage(dropped_points) - # if outlier_pct: - # logger.warning( - # f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. " - # f"Keeping original dataset." - # ) - # self.data['DBSCAN_eps'] = 0 - # return - - # self.data_dictionary['train_features'] = self.data_dictionary['train_features'][ - # (clustering.labels_ != -1) - # ] - # self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ - # (clustering.labels_ != -1) - # ] - # self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ - # (clustering.labels_ != -1) - # ] - - # logger.info( - # f"DBSCAN tossed {dropped_points.sum()}" - # f" train points from {len(clustering.labels_)}" - # ) - - # return - - # def compute_inlier_metric(self, set_='train') -> None: - # """ - # Compute inlier metric from backwards distance distributions. - # This metric defines how well features from a timepoint fit - # into previous timepoints. 
- # """ - - # def normalise(dataframe: DataFrame, key: str) -> DataFrame: - # if set_ == 'train': - # min_value = dataframe.min() - # max_value = dataframe.max() - # self.data[f'{key}_min'] = min_value - # self.data[f'{key}_max'] = max_value - # else: - # min_value = self.data[f'{key}_min'] - # max_value = self.data[f'{key}_max'] - # return (dataframe - min_value) / (max_value - min_value) - - # no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] - - # if set_ == 'train': - # compute_df = copy.deepcopy(self.data_dictionary['train_features']) - # elif set_ == 'test': - # compute_df = copy.deepcopy(self.data_dictionary['test_features']) - # else: - # compute_df = copy.deepcopy(self.data_dictionary['prediction_features']) - - # compute_df_reindexed = compute_df.reindex( - # index=np.flip(compute_df.index) - # ) - - # pairwise = pd.DataFrame( - # np.triu( - # pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count) - # ), - # columns=compute_df_reindexed.index, - # index=compute_df_reindexed.index - # ) - # pairwise = pairwise.round(5) - - # column_labels = [ - # '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1) - # ] - # distances = pd.DataFrame( - # columns=column_labels, index=compute_df.index - # ) - - # for index in compute_df.index[no_prev_pts:]: - # current_row = pairwise.loc[[index]] - # current_row_no_zeros = current_row.loc[ - # :, (current_row != 0).any(axis=0) - # ] - # distances.loc[[index]] = current_row_no_zeros.iloc[ - # :, :no_prev_pts - # ] - # distances = distances.replace([np.inf, -np.inf], np.nan) - # drop_index = pd.isnull(distances).any(axis=1) - # distances = distances[drop_index == 0] - - # inliers = pd.DataFrame(index=distances.index) - # for key in distances.keys(): - # current_distances = distances[key].dropna() - # current_distances = normalise(current_distances, key) - # if set_ == 'train': - # fit_params = stats.weibull_min.fit(current_distances) - # self.data[f'{key}_fit_params'] = fit_params - 
# else: - # fit_params = self.data[f'{key}_fit_params'] - # quantiles = stats.weibull_min.cdf(current_distances, *fit_params) - - # df_inlier = pd.DataFrame( - # {key: quantiles}, index=distances.index - # ) - # inliers = pd.concat( - # [inliers, df_inlier], axis=1 - # ) - - # inlier_metric = pd.DataFrame( - # data=inliers.sum(axis=1) / no_prev_pts, - # columns=['%-inlier_metric'], - # index=compute_df.index - # ) - - # inlier_metric = (2 * (inlier_metric - inlier_metric.min()) / - # (inlier_metric.max() - inlier_metric.min()) - 1) - - # if set_ in ('train', 'test'): - # inlier_metric = inlier_metric.iloc[no_prev_pts:] - # compute_df = compute_df.iloc[no_prev_pts:] - # self.remove_beginning_points_from_data_dict(set_, no_prev_pts) - # self.data_dictionary[f'{set_}_features'] = pd.concat( - # [compute_df, inlier_metric], axis=1) - # else: - # self.data_dictionary['prediction_features'] = pd.concat( - # [compute_df, inlier_metric], axis=1) - # self.data_dictionary['prediction_features'].fillna(0, inplace=True) - - # logger.info('Inlier metric computed and added to features.') - - # return None - - # def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10): - # features = self.data_dictionary[f'{set_}_features'] - # weights = self.data_dictionary[f'{set_}_weights'] - # labels = self.data_dictionary[f'{set_}_labels'] - # self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:] - # self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] - # self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] - def add_noise_to_training_features(self) -> None: """ Add noise to train features to reduce the risk of overfitting. 
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index cacbfea67..6dfa9855c 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -7,9 +7,11 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Tuple +import datasieve.transforms as ds import numpy as np import pandas as pd import psutil +from datasieve.pipeline import Pipeline from numpy.typing import NDArray from pandas import DataFrame @@ -23,8 +25,6 @@ from freqtrade.freqai.data_drawer import FreqaiDataDrawer from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.utils import get_tb_logger, plot_feature_importance, record_params from freqtrade.strategy.interface import IStrategy -from datasieve.pipeline import Pipeline -import datasieve.transforms as ds pd.options.mode.chained_assignment = None @@ -505,94 +505,39 @@ class IFreqaiModel(ABC): "feature_engineering_* functions" ) - def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None: - """ - Base data cleaning method for train. - Functions here improve/modify the input data by identifying outliers, - computing additional metrics, adding noise, reducing dimensionality etc. 
- """ - - ft_params = self.freqai_info["feature_parameters"] - - if ft_params.get('inlier_metric_window', 0): - dk.compute_inlier_metric(set_='train') - if self.freqai_info["data_split_parameters"]["test_size"] > 0: - dk.compute_inlier_metric(set_='test') - - if ft_params.get( - "principal_component_analysis", False - ): - dk.principal_component_analysis() - - if ft_params.get("use_SVM_to_remove_outliers", False): - dk.use_SVM_to_remove_outliers(predict=False) - - if ft_params.get("DI_threshold", 0): - dk.data["avg_mean_dist"] = dk.compute_distances() - - if ft_params.get("use_DBSCAN_to_remove_outliers", False): - if dk.pair in self.dd.old_DBSCAN_eps: - eps = self.dd.old_DBSCAN_eps[dk.pair] - else: - eps = None - dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) - self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] - - if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): - dk.add_noise_to_training_features() - - def data_cleaning_predict(self, dk: FreqaiDataKitchen) -> None: - """ - Base data cleaning method for predict. - Functions here are complementary to the functions of data_cleaning_train. 
- """ - ft_params = self.freqai_info["feature_parameters"] - - # ensure user is feeding the correct indicators to the model - self.check_if_feature_list_matches_strategy(dk) - - if ft_params.get('inlier_metric_window', 0): - dk.compute_inlier_metric(set_='predict') - - if ft_params.get( - "principal_component_analysis", False - ): - dk.pca_transform(dk.data_dictionary['prediction_features']) - - if ft_params.get("use_SVM_to_remove_outliers", False): - dk.use_SVM_to_remove_outliers(predict=True) - - if ft_params.get("DI_threshold", 0): - dk.check_if_pred_in_training_spaces() - - if ft_params.get("use_DBSCAN_to_remove_outliers", False): - dk.use_DBSCAN_to_remove_outliers(predict=True) - def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: ft_params = self.freqai_info["feature_parameters"] - dk.pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]) + dk.feature_pipeline = Pipeline( + [('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]) if ft_params.get("principal_component_analysis", False): - dk.pipeline.steps += [('pca', ds.DataSievePCA())] - dk.pipeline.steps += [('post-pca-scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))] + dk.feature_pipeline.steps += [('pca', ds.DataSievePCA())] + dk.feature_pipeline.steps += [('post-pca-scaler', + ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))] if ft_params.get("use_SVM_to_remove_outliers", False): - dk.pipeline.steps += [('svm', ds.SVMOutlierExtractor())] + svm_params = ft_params.get( + "svm_params", {"shuffle": False, "nu": 0.01}) + dk.feature_pipeline.steps += [('svm', ds.SVMOutlierExtractor(**svm_params))] - if ft_params.get("DI_threshold", 0): - dk.pipeline.steps += [('di', ds.DissimilarityIndex())] + di = ft_params.get("DI_threshold", 0) + if di: + dk.feature_pipeline.steps += [('di', ds.DissimilarityIndex(di_threshold=di))] if ft_params.get("use_DBSCAN_to_remove_outliers", False): - dk.pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())] + 
dk.feature_pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())] - dk.pipeline.fitparams = dk.pipeline._validate_fitparams({}, dk.pipeline.steps) + dk.feature_pipeline.fitparams = dk.feature_pipeline._validate_fitparams( + {}, dk.feature_pipeline.steps) # if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): # dk.pipeline.extend(('noise', ds.Noise())) def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None: - dk.label_pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]) + dk.label_pipeline = Pipeline([ + ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))) + ]) def model_exists(self, dk: FreqaiDataKitchen) -> bool: """ diff --git a/freqtrade/freqai/prediction_models/PyTorchTransformerRegressor.py b/freqtrade/freqai/prediction_models/PyTorchTransformerRegressor.py index b3b684c14..bf78488ff 100644 --- a/freqtrade/freqai/prediction_models/PyTorchTransformerRegressor.py +++ b/freqtrade/freqai/prediction_models/PyTorchTransformerRegressor.py @@ -103,13 +103,13 @@ class PyTorchTransformerRegressor(BasePyTorchRegressor): """ dk.find_features(unfiltered_df) - filtered_df, _ = dk.filter_features( + dk.data_dictionary["prediction_features"], _ = dk.filter_features( unfiltered_df, dk.training_features_list, training_filter=False ) - filtered_df = dk.normalize_data_from_metadata(filtered_df) - dk.data_dictionary["prediction_features"] = filtered_df - self.data_cleaning_predict(dk) + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) + x = self.data_convertor.convert_x( dk.data_dictionary["prediction_features"], device=self.device @@ -131,7 +131,13 @@ class PyTorchTransformerRegressor(BasePyTorchRegressor): yb = yb.cpu().squeeze() pred_df = pd.DataFrame(yb.detach().numpy(), columns=dk.label_list) - pred_df = dk.denormalize_labels_from_metadata(pred_df) + pred_df, _, _ = 
dk.label_pipeline.inverse_transform(pred_df) + + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() if x.shape[1] > 1: zeros_df = pd.DataFrame(np.zeros((x.shape[1] - len(pred_df), len(pred_df.columns))), diff --git a/freqtrade/freqai/prediction_models/XGBoostRFRegressor.py b/freqtrade/freqai/prediction_models/XGBoostRFRegressor.py index 1aefbf19a..f43585ab0 100644 --- a/freqtrade/freqai/prediction_models/XGBoostRFRegressor.py +++ b/freqtrade/freqai/prediction_models/XGBoostRFRegressor.py @@ -5,6 +5,7 @@ from xgboost import XGBRFRegressor from freqtrade.freqai.base_models.BaseRegressionModel import BaseRegressionModel from freqtrade.freqai.data_kitchen import FreqaiDataKitchen +from freqtrade.freqai.tensorboard import TBCallback logger = logging.getLogger(__name__) @@ -44,7 +45,10 @@ class XGBoostRFRegressor(BaseRegressionModel): model = XGBRFRegressor(**self.model_training_parameters) + model.set_params(callbacks=[TBCallback(dk.data_path)], activate=self.activate_tensorboard) model.fit(X=X, y=y, sample_weight=sample_weight, eval_set=eval_set, sample_weight_eval_set=eval_weights, xgb_model=xgb_model) + # set the callbacks to empty so that we can serialize to disk later + model.set_params(callbacks=[]) return model diff --git a/freqtrade/freqai/prediction_models/XGBoostRegressor.py b/freqtrade/freqai/prediction_models/XGBoostRegressor.py index f8b4d353d..88d348448 100644 --- a/freqtrade/freqai/prediction_models/XGBoostRegressor.py +++ b/freqtrade/freqai/prediction_models/XGBoostRegressor.py @@ -8,6 +8,9 @@ from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.tensorboard import TBCallback +# from datasieve.pipeline import Pipeline +# from freqtrade.freqai.transforms import FreqaiQuantileTransformer + logger = logging.getLogger(__name__) @@ -52,3 +55,23 @@ class XGBoostRegressor(BaseRegressionModel): 
model.set_params(callbacks=[]) return model + + # def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: + # """ + # User defines their custom eature pipeline here (if they wish) + # """ + # dk.feature_pipeline = Pipeline([ + # ('qt', FreqaiQuantileTransformer(output_distribution='normal')) + # ]) + + # return + + # def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None: + # """ + # User defines their custom label pipeline here (if they wish) + # """ + # dk.label_pipeline = Pipeline([ + # ('qt', FreqaiQuantileTransformer(output_distribution='normal')) + # ]) + + # return diff --git a/freqtrade/freqai/transforms/__init__.py b/freqtrade/freqai/transforms/__init__.py new file mode 100644 index 000000000..9b7d8ccf5 --- /dev/null +++ b/freqtrade/freqai/transforms/__init__.py @@ -0,0 +1,6 @@ +from freqtrade.freqai.transforms.quantile_transform import FreqaiQuantileTransformer + + +__all__ = ( + "FreqaiQuantileTransformer", +) diff --git a/freqtrade/freqai/transforms/quantile_transform.py b/freqtrade/freqai/transforms/quantile_transform.py new file mode 100644 index 000000000..3d1bd2731 --- /dev/null +++ b/freqtrade/freqai/transforms/quantile_transform.py @@ -0,0 +1,28 @@ +from sklearn.preprocessing import QuantileTransformer + + +class FreqaiQuantileTransformer(QuantileTransformer): + """ + A subclass of the SKLearn Quantile that ensures fit, transform, fit_transform and + inverse_transform all take the full set of params X, y, sample_weight required to + benefit from the DataSieve features. 
+ """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + super().fit(X) + X = super().transform(X) + return X, y, sample_weight, feature_list + + def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + super().fit(X) + return X, y, sample_weight, feature_list + + def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + X = super().transform(X) + return X, y, sample_weight, feature_list + + def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + return super().inverse_transform(X), y, sample_weight, feature_list diff --git a/freqtrade/resolvers/freqaimodel_resolver.py b/freqtrade/resolvers/freqaimodel_resolver.py index 48c3facac..3696b9e56 100644 --- a/freqtrade/resolvers/freqaimodel_resolver.py +++ b/freqtrade/resolvers/freqaimodel_resolver.py @@ -34,7 +34,7 @@ class FreqaiModelResolver(IResolver): Load the custom class from config parameter :param config: configuration dictionary """ - disallowed_models = ["BaseRegressionModel", "BaseTensorFlowModel"] + disallowed_models = ["BaseRegressionModel"] freqaimodel_name = config.get("freqaimodel") if not freqaimodel_name: diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index e3ef1612c..c067df151 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -10,8 +10,8 @@ from freqtrade.data.dataprovider import DataProvider from freqtrade.exceptions import OperationalException from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from tests.conftest import get_patched_exchange # , log_has_re -from tests.freqai.conftest import (get_patched_data_kitchen, get_patched_freqai_strategy, - make_unfiltered_dataframe) # make_data_dictionary, +from tests.freqai.conftest import make_unfiltered_dataframe # make_data_dictionary, +from tests.freqai.conftest 
import get_patched_data_kitchen, get_patched_freqai_strategy from tests.freqai.test_freqai_interface import is_mac @@ -72,68 +72,6 @@ def test_check_if_model_expired(mocker, freqai_conf): shutil.rmtree(Path(dk.full_path)) -# def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog): -# freqai = make_data_dictionary(mocker, freqai_conf) -# # freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1}) -# freqai.dk.use_DBSCAN_to_remove_outliers(predict=False) -# assert log_has_re(r"DBSCAN found eps of 1\.7\d\.", caplog) - - -# def test_compute_distances(mocker, freqai_conf): -# freqai = make_data_dictionary(mocker, freqai_conf) -# freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1}) -# avg_mean_dist = freqai.dk.compute_distances() -# assert round(avg_mean_dist, 2) == 1.98 - - -# def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, caplog): -# freqai = make_data_dictionary(mocker, freqai_conf) -# freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1}) -# freqai.dk.use_SVM_to_remove_outliers(predict=False) -# assert log_has_re( -# "SVM detected 7.83%", -# caplog, -# ) - - -# def test_compute_inlier_metric(mocker, freqai_conf, caplog): -# freqai = make_data_dictionary(mocker, freqai_conf) -# freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10}) -# freqai.dk.compute_inlier_metric(set_='train') -# assert log_has_re( -# "Inlier metric computed and added to features.", -# caplog, -# ) - - -# def test_add_noise_to_training_features(mocker, freqai_conf): -# freqai = make_data_dictionary(mocker, freqai_conf) -# freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1}) -# freqai.dk.add_noise_to_training_features() - - -# def test_remove_beginning_points_from_data_dict(mocker, freqai_conf): -# freqai = make_data_dictionary(mocker, freqai_conf) -# freqai.dk.remove_beginning_points_from_data_dict(set_='train') 
- - -# def test_principal_component_analysis(mocker, freqai_conf, caplog): -# freqai = make_data_dictionary(mocker, freqai_conf) -# freqai.dk.principal_component_analysis() -# assert log_has_re( -# "reduced feature dimension by", -# caplog, -# ) - - -# def test_normalize_data(mocker, freqai_conf): -# freqai = make_data_dictionary(mocker, freqai_conf) -# data_dict = freqai.dk.data_dictionary -# freqai.dk.normalize_data(data_dict) -# assert any('_max' in entry for entry in freqai.dk.data.keys()) -# assert any('_min' in entry for entry in freqai.dk.data.keys()) - - def test_filter_features(mocker, freqai_conf): freqai, unfiltered_dataframe = make_unfiltered_dataframe(mocker, freqai_conf) freqai.dk.find_features(unfiltered_dataframe) diff --git a/tests/freqai/test_freqai_interface.py b/tests/freqai/test_freqai_interface.py index 61a7b7346..90959ec2c 100644 --- a/tests/freqai/test_freqai_interface.py +++ b/tests/freqai/test_freqai_interface.py @@ -74,6 +74,7 @@ def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, freqai_conf = make_rl_config(freqai_conf) # test the RL guardrails freqai_conf['freqai']['feature_parameters'].update({"use_SVM_to_remove_outliers": True}) + freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 2}) freqai_conf['freqai']['data_split_parameters'].update({'shuffle': True}) if 'test_3ac' in model or 'test_4ac' in model: @@ -162,7 +163,6 @@ def test_extract_data_and_train_model_MultiTargets(mocker, freqai_conf, model, s assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").is_file() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").is_file() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_trained_df.pkl").is_file() - assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_svm_model.joblib").is_file() assert len(freqai.dk.data['training_features_list']) == 14 shutil.rmtree(Path(freqai.dk.full_path)) @@ -218,7 +218,6 @@ def 
test_extract_data_and_train_model_Classifiers(mocker, freqai_conf, model): f"{freqai.dk.model_filename}_model{model_file_extension}").exists() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").exists() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_trained_df.pkl").exists() - assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_svm_model.joblib").exists() shutil.rmtree(Path(freqai.dk.full_path)) @@ -283,9 +282,6 @@ def test_start_backtesting(mocker, freqai_conf, model, num_files, strat, caplog) _, base_df = freqai.dd.get_base_and_corr_dataframes(sub_timerange, "LTC/BTC", freqai.dk) df = base_df[freqai_conf["timeframe"]] - for i in range(5): - df[f'%-constant_{i}'] = i - metadata = {"pair": "LTC/BTC"} freqai.dk.set_paths('LTC/BTC', None) freqai.start_backtesting(df, metadata, freqai.dk, strategy) @@ -293,14 +289,6 @@ def test_start_backtesting(mocker, freqai_conf, model, num_files, strat, caplog) assert len(model_folders) == num_files Trade.use_db = True - assert log_has_re( - "Removed features ", - caplog, - ) - assert log_has_re( - "Removed 5 features from prediction features, ", - caplog, - ) Backtesting.cleanup() shutil.rmtree(Path(freqai.dk.full_path)) @@ -425,36 +413,6 @@ def test_backtesting_fit_live_predictions(mocker, freqai_conf, caplog): shutil.rmtree(Path(freqai.dk.full_path)) -def test_principal_component_analysis(mocker, freqai_conf): - freqai_conf.update({"timerange": "20180110-20180130"}) - freqai_conf.get("freqai", {}).get("feature_parameters", {}).update( - {"princpial_component_analysis": "true"}) - - strategy = get_patched_freqai_strategy(mocker, freqai_conf) - exchange = get_patched_exchange(mocker, freqai_conf) - strategy.dp = DataProvider(freqai_conf, exchange) - strategy.freqai_info = freqai_conf.get("freqai", {}) - freqai = strategy.freqai - freqai.live = True - freqai.dk = FreqaiDataKitchen(freqai_conf) - freqai.dk.live = True - timerange = TimeRange.parse_timerange("20180110-20180130") 
- freqai.dd.load_all_pair_histories(timerange, freqai.dk) - - freqai.dd.pair_dict = MagicMock() - - data_load_timerange = TimeRange.parse_timerange("20180110-20180130") - new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.dk.set_paths('ADA/BTC', None) - - freqai.extract_data_and_train_model( - new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) - - assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_pca_object.pkl") - - shutil.rmtree(Path(freqai.dk.full_path)) - - def test_plot_feature_importance(mocker, freqai_conf): from freqtrade.freqai.utils import plot_feature_importance From 62378068170dc4144e52042e0774502680c75662 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Mon, 29 May 2023 15:18:28 +0200 Subject: [PATCH 03/32] bump datasieve to 0.0.8 --- requirements-freqai.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-freqai.txt b/requirements-freqai.txt index 66da4e873..0cc5762e0 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.0.5 +datasieve==0.0.8 From 785f0d396f954805cc73ad5f532fa174a5325adf Mon Sep 17 00:00:00 2001 From: robcaulk Date: Mon, 29 May 2023 16:44:53 +0200 Subject: [PATCH 04/32] bump datasieve version --- requirements-freqai.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-freqai.txt b/requirements-freqai.txt index 0cc5762e0..81d49eee4 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.0.8 +datasieve==0.0.9 From f6a32f4ffd91def67a98f77a2aafe513185805cd Mon Sep 17 00:00:00 2001 From: robcaulk Date: Mon, 29 May 2023 23:35:24 +0200 Subject: [PATCH 05/32] bump version --- 
freqtrade/freqai/data_kitchen.py | 115 --------------------------- freqtrade/freqai/freqai_interface.py | 6 +- requirements-freqai.txt | 2 +- 3 files changed, 5 insertions(+), 118 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 04182dc69..127193a35 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -77,8 +77,6 @@ class FreqaiDataKitchen: self.backtest_predictions_folder: str = "backtesting_predictions" self.live = live self.pair = pair - - # self.svm_model: linear_model.SGDOneClassSVM = None self.keras: bool = self.freqai_config.get("keras", False) self.set_all_pairs() self.backtest_live_models = config.get("freqai_backtest_live_models", False) @@ -225,13 +223,6 @@ class FreqaiDataKitchen: drop_index = pd.isnull(filtered_df).any(axis=1) # get the rows that have NaNs, drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement. if (training_filter): - # const_cols = list((filtered_df.nunique() == 1).loc[lambda x: x].index) - # if const_cols: - # filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols)) - # self.data['constant_features_list'] = const_cols - # logger.warning(f"Removed features {const_cols} with constant values.") - # else: - # self.data['constant_features_list'] = [] # we don't care about total row number (total no. 
datapoints) in training, we only care # about removing any row with NaNs @@ -264,9 +255,6 @@ class FreqaiDataKitchen: else: - # if 'constant_features_list' in self.data and len(self.data['constant_features_list']): - # filtered_df = self.check_pred_labels(filtered_df) - # we are backtesting so we need to preserve row number to send back to strategy, # so now we use do_predict to avoid any prediction based on a NaN drop_index = pd.isnull(filtered_df).any(axis=1) @@ -308,107 +296,6 @@ class FreqaiDataKitchen: return self.data_dictionary - # def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: - # """ - # Normalize all data in the data_dictionary according to the training dataset - # :param data_dictionary: dictionary containing the cleaned and - # split training/test data/labels - # :returns: - # :data_dictionary: updated dictionary with standardized values. - # """ - - # # standardize the data by training stats - # train_max = data_dictionary["train_features"].max() - # train_min = data_dictionary["train_features"].min() - # data_dictionary["train_features"] = ( - # 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1 - # ) - # data_dictionary["test_features"] = ( - # 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 - # ) - - # for item in train_max.keys(): - # self.data[item + "_max"] = train_max[item] - # self.data[item + "_min"] = train_min[item] - - # for item in data_dictionary["train_labels"].keys(): - # if data_dictionary["train_labels"][item].dtype == object: - # continue - # train_labels_max = data_dictionary["train_labels"][item].max() - # train_labels_min = data_dictionary["train_labels"][item].min() - # data_dictionary["train_labels"][item] = ( - # 2 - # * (data_dictionary["train_labels"][item] - train_labels_min) - # / (train_labels_max - train_labels_min) - # - 1 - # ) - # if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: - # 
data_dictionary["test_labels"][item] = ( - # 2 - # * (data_dictionary["test_labels"][item] - train_labels_min) - # / (train_labels_max - train_labels_min) - # - 1 - # ) - - # self.data[f"{item}_max"] = train_labels_max - # self.data[f"{item}_min"] = train_labels_min - # return data_dictionary - - # def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: - - # train_max = df.max() - # train_min = df.min() - # df = ( - # 2 * (df - train_min) / (train_max - train_min) - 1 - # ) - - # for item in train_max.keys(): - # self.data[item + "_max"] = train_max[item] - # self.data[item + "_min"] = train_min[item] - - # return df - - # def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: - # """ - # Normalize a set of data using the mean and standard deviation from - # the associated training data. - # :param df: Dataframe to be standardized - # """ - - # train_max = [None] * len(df.keys()) - # train_min = [None] * len(df.keys()) - - # for i, item in enumerate(df.keys()): - # train_max[i] = self.data[f"{item}_max"] - # train_min[i] = self.data[f"{item}_min"] - - # train_max_series = pd.Series(train_max, index=df.keys()) - # train_min_series = pd.Series(train_min, index=df.keys()) - - # df = ( - # 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1 - # ) - - # return df - - # def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: - # """ - # Denormalize a set of data using the mean and standard deviation from - # the associated training data. 
- # :param df: Dataframe of predictions to be denormalized - # """ - - # for label in df.columns: - # if df[label].dtype == object or label in self.unique_class_list: - # continue - # df[label] = ( - # (df[label] + 1) - # * (self.data[f"{label}_max"] - self.data[f"{label}_min"]) - # / 2 - # ) + self.data[f"{label}_min"] - - # return df - def split_timerange( self, tr: str, train_split: int = 28, bt_split: float = 7 ) -> Tuple[list, list]: @@ -453,9 +340,7 @@ class FreqaiDataKitchen: tr_training_list_timerange.append(copy.deepcopy(timerange_train)) # associated backtest period - timerange_backtest.startts = timerange_train.stopts - timerange_backtest.stopts = timerange_backtest.startts + int(bt_period) if timerange_backtest.stopts > config_timerange.stopts: diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 6dfa9855c..3f04b17fb 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -507,8 +507,10 @@ class IFreqaiModel(ABC): def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: ft_params = self.freqai_info["feature_parameters"] - dk.feature_pipeline = Pipeline( - [('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]) + dk.feature_pipeline = Pipeline([ + ('const', ds.DataSieveVarianceThreshold(threshold=0)), + ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))) + ]) if ft_params.get("principal_component_analysis", False): dk.feature_pipeline.steps += [('pca', ds.DataSievePCA())] diff --git a/requirements-freqai.txt b/requirements-freqai.txt index 81d49eee4..31c73b594 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.0.9 +datasieve==0.1.0 From 94bc91ef57d0213b973342ee26e80b8473a233c6 Mon Sep 17 00:00:00 2001 From: Robert Caulk Date: Sun, 4 Jun 2023 21:50:13 +0200 Subject: [PATCH 
06/32] Update tests/freqai/test_freqai_datakitchen.py Co-authored-by: Matthias --- tests/freqai/test_freqai_datakitchen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index c067df151..77816749f 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -9,8 +9,8 @@ from freqtrade.configuration import TimeRange from freqtrade.data.dataprovider import DataProvider from freqtrade.exceptions import OperationalException from freqtrade.freqai.data_kitchen import FreqaiDataKitchen -from tests.conftest import get_patched_exchange # , log_has_re -from tests.freqai.conftest import make_unfiltered_dataframe # make_data_dictionary, +from tests.conftest import get_patched_exchange +from tests.freqai.conftest import make_unfiltered_dataframe from tests.freqai.conftest import get_patched_data_kitchen, get_patched_freqai_strategy from tests.freqai.test_freqai_interface import is_mac From 5ac141f72b2df55d4ef9444a746860f73a82b8e6 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 6 Jun 2023 21:05:51 +0200 Subject: [PATCH 07/32] convert to new datasieve api --- docs/freqai-feature-engineering.md | 35 ++----------------- freqtrade/freqai/freqai_interface.py | 22 ++++++------ .../prediction_models/XGBoostRegressor.py | 6 ++-- freqtrade/freqai/transforms/__init__.py | 6 ---- .../freqai/transforms/quantile_transform.py | 28 --------------- requirements-freqai.txt | 2 +- 6 files changed, 18 insertions(+), 81 deletions(-) delete mode 100644 freqtrade/freqai/transforms/__init__.py delete mode 100644 freqtrade/freqai/transforms/quantile_transform.py diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index eb4b4272e..0eee0793b 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -254,47 +254,18 @@ Users are encouraged to customize the data pipeline to their needs by 
building t """ User defines their custom eature pipeline here (if they wish) """ - from freqtrade.freqai.transforms import FreqaiQuantileTransformer + from sklearn.preprocessing import QuantileTransformer dk.feature_pipeline = Pipeline([ - ('qt', FreqaiQuantileTransformer(output_distribution='normal')) + ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))) ]) return ``` -Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. If you have a custom step that you would like to add to the pipeline, you simply create a class that follows the DataSieve/SKLearn API. That means your step must have a `fit()`, `transform()`, `fit_transform()`, and `inverse_transform()` method. You can see examples of this in the `freqtrade.freqai.transforms` module where we use SKLearn `QuantileNormalization` to create a new step for the pipeline. +Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. Here you can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class. As there is the `feature_pipeline`, there also exists a definition for the `label_pipeline` which can be defined the same way as the `feature_pipeline`, by overriding `define_label_pipeline`. -!!! note "Inheritence required" - While most SKLearn methods are very easy to override, as shown in freqtrade/freqai/transforms/quantile_transform.py, they still need to include passing X, y, and sample_weights through all `fit()`, `transform()`, `fit_transform()` and `inverse_transform()` functions, even if that means a direct pass through without modifications. - - - ## Outlier detection Equity and crypto markets suffer from a high level of non-patterned noise in the form of outlier data points. FreqAI implements a variety of methods to identify such outliers and hence mitigate risk. 
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 3f04b17fb..ffe0ee8c3 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -12,8 +12,10 @@ import numpy as np import pandas as pd import psutil from datasieve.pipeline import Pipeline +from datasieve.transforms import SKLearnWrapper from numpy.typing import NDArray from pandas import DataFrame +from sklearn.preprocessing import MinMaxScaler from freqtrade.configuration import TimeRange from freqtrade.constants import Config @@ -509,25 +511,25 @@ class IFreqaiModel(ABC): ft_params = self.freqai_info["feature_parameters"] dk.feature_pipeline = Pipeline([ ('const', ds.DataSieveVarianceThreshold(threshold=0)), - ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))) + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) ]) if ft_params.get("principal_component_analysis", False): - dk.feature_pipeline.steps += [('pca', ds.DataSievePCA())] - dk.feature_pipeline.steps += [('post-pca-scaler', - ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))] + dk.feature_pipeline.append(('pca', ds.DataSievePCA())) + dk.feature_pipeline.append(('post-pca-scaler', + SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))) if ft_params.get("use_SVM_to_remove_outliers", False): svm_params = ft_params.get( "svm_params", {"shuffle": False, "nu": 0.01}) - dk.feature_pipeline.steps += [('svm', ds.SVMOutlierExtractor(**svm_params))] + dk.feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params))) di = ft_params.get("DI_threshold", 0) if di: - dk.feature_pipeline.steps += [('di', ds.DissimilarityIndex(di_threshold=di))] + dk.feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di))) if ft_params.get("use_DBSCAN_to_remove_outliers", False): - dk.feature_pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())] + dk.feature_pipeline.append(('dbscan', ds.DataSieveDBSCAN())) dk.feature_pipeline.fitparams = 
dk.feature_pipeline._validate_fitparams( {}, dk.feature_pipeline.steps) @@ -538,7 +540,7 @@ class IFreqaiModel(ABC): def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None: dk.label_pipeline = Pipeline([ - ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))) + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) ]) def model_exists(self, dk: FreqaiDataKitchen) -> bool: @@ -551,8 +553,6 @@ class IFreqaiModel(ABC): """ if self.dd.model_type == 'joblib': file_type = ".joblib" - elif self.dd.model_type == 'keras': - file_type = ".h5" elif self.dd.model_type in ["stable_baselines3", "sb3_contrib", "pytorch"]: file_type = ".zip" @@ -676,7 +676,7 @@ class IFreqaiModel(ABC): # # for keras type models, the conv_window needs to be prepended so # # viewing is correct in frequi - if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0): + if self.ft_params.get('inlier_metric_window', 0): n_lost_points = self.freqai_info.get('conv_width', 2) zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))), columns=hist_preds_df.columns) diff --git a/freqtrade/freqai/prediction_models/XGBoostRegressor.py b/freqtrade/freqai/prediction_models/XGBoostRegressor.py index 88d348448..19c051b91 100644 --- a/freqtrade/freqai/prediction_models/XGBoostRegressor.py +++ b/freqtrade/freqai/prediction_models/XGBoostRegressor.py @@ -9,7 +9,7 @@ from freqtrade.freqai.tensorboard import TBCallback # from datasieve.pipeline import Pipeline -# from freqtrade.freqai.transforms import FreqaiQuantileTransformer +# from sklearn.preprocessing import QuantileTransformer logger = logging.getLogger(__name__) @@ -61,7 +61,7 @@ class XGBoostRegressor(BaseRegressionModel): # User defines their custom eature pipeline here (if they wish) # """ # dk.feature_pipeline = Pipeline([ - # ('qt', FreqaiQuantileTransformer(output_distribution='normal')) + # ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))) # ]) # return @@ -71,7 
+71,7 @@ class XGBoostRegressor(BaseRegressionModel): # User defines their custom label pipeline here (if they wish) # """ # dk.label_pipeline = Pipeline([ - # ('qt', FreqaiQuantileTransformer(output_distribution='normal')) + # ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))) # ]) # return diff --git a/freqtrade/freqai/transforms/__init__.py b/freqtrade/freqai/transforms/__init__.py deleted file mode 100644 index 9b7d8ccf5..000000000 --- a/freqtrade/freqai/transforms/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from freqtrade.freqai.transforms.quantile_transform import FreqaiQuantileTransformer - - -__all__ = ( - "FreqaiQuantileTransformer", -) diff --git a/freqtrade/freqai/transforms/quantile_transform.py b/freqtrade/freqai/transforms/quantile_transform.py deleted file mode 100644 index 3d1bd2731..000000000 --- a/freqtrade/freqai/transforms/quantile_transform.py +++ /dev/null @@ -1,28 +0,0 @@ -from sklearn.preprocessing import QuantileTransformer - - -class FreqaiQuantileTransformer(QuantileTransformer): - """ - A subclass of the SKLearn Quantile that ensures fit, transform, fit_transform and - inverse_transform all take the full set of params X, y, sample_weight required to - benefit from the DataSieve features. 
- """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - super().fit(X) - X = super().transform(X) - return X, y, sample_weight, feature_list - - def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - super().fit(X) - return X, y, sample_weight, feature_list - - def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - X = super().transform(X) - return X, y, sample_weight, feature_list - - def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): - return super().inverse_transform(X), y, sample_weight, feature_list diff --git a/requirements-freqai.txt b/requirements-freqai.txt index 31c73b594..748950e24 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.1.0 +datasieve==0.1.1 From f7f88aa14d5547cb827516dc6538ce4a94676c70 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 09:28:56 +0200 Subject: [PATCH 08/32] fix pickle file name --- freqtrade/freqai/data_drawer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 670dfc620..067790b9a 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -523,7 +523,7 @@ class FreqaiDataDrawer: dk.data_dictionary["train_features"] = pd.read_pickle( dk.data_path / f"{dk.model_filename}_trained_df.pkl" ) - with (dk.data_path / f"{dk.model_filename}_pipeline.pkl").open("rb") as fp: + with (dk.data_path / f"{dk.model_filename}_feature_pipeline.pkl").open("rb") as fp: dk.feature_pipeline = cloudpickle.load(fp) with (dk.data_path / f"{dk.model_filename}_label_pipeline.pkl").open("rb") as fp: dk.label_pipeline = cloudpickle.load(fp) From 
4d4589becded66667ae91ff2b91f8dc6712fa81a Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 14:00:00 +0200 Subject: [PATCH 09/32] fix isort in tests --- tests/freqai/test_freqai_datakitchen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index 77816749f..8d09cfc58 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -10,8 +10,8 @@ from freqtrade.data.dataprovider import DataProvider from freqtrade.exceptions import OperationalException from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from tests.conftest import get_patched_exchange -from tests.freqai.conftest import make_unfiltered_dataframe -from tests.freqai.conftest import get_patched_data_kitchen, get_patched_freqai_strategy +from tests.freqai.conftest import (get_patched_data_kitchen, get_patched_freqai_strategy, + make_unfiltered_dataframe) from tests.freqai.test_freqai_interface import is_mac From dc577d2a1a751180d0c80b4d76c6185ce2d3332d Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 17:58:27 +0200 Subject: [PATCH 10/32] update to new datasieve interface, add noise to pipeline --- freqtrade/freqai/data_kitchen.py | 11 ----------- freqtrade/freqai/freqai_interface.py | 13 +++++++------ requirements-freqai.txt | 2 +- tests/freqai/test_freqai_interface.py | 28 ++++++++++++++------------- 4 files changed, 23 insertions(+), 31 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 127193a35..ecdb2e109 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -371,17 +371,6 @@ class FreqaiDataKitchen: return df - def add_noise_to_training_features(self) -> None: - """ - Add noise to train features to reduce the risk of overfitting. 
- """ - mu = 0 # no shift - sigma = self.freqai_config["feature_parameters"]["noise_standard_deviation"] - compute_df = self.data_dictionary['train_features'] - noise = np.random.normal(mu, sigma, [compute_df.shape[0], compute_df.shape[1]]) - self.data_dictionary['train_features'] += noise - return - def find_features(self, dataframe: DataFrame) -> None: """ Find features in the strategy provided dataframe diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index ffe0ee8c3..632266b00 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -510,12 +510,12 @@ class IFreqaiModel(ABC): def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: ft_params = self.freqai_info["feature_parameters"] dk.feature_pipeline = Pipeline([ - ('const', ds.DataSieveVarianceThreshold(threshold=0)), + ('const', ds.VarianceThreshold(threshold=0)), ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) ]) if ft_params.get("principal_component_analysis", False): - dk.feature_pipeline.append(('pca', ds.DataSievePCA())) + dk.feature_pipeline.append(('pca', ds.PCA())) dk.feature_pipeline.append(('post-pca-scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))) @@ -529,14 +529,15 @@ class IFreqaiModel(ABC): dk.feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di))) if ft_params.get("use_DBSCAN_to_remove_outliers", False): - dk.feature_pipeline.append(('dbscan', ds.DataSieveDBSCAN())) + dk.feature_pipeline.append(('dbscan', ds.DBSCAN())) + + sigma = self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0) + if sigma: + dk.feature_pipeline.append(('noise', ds.Noise(sigma=sigma))) dk.feature_pipeline.fitparams = dk.feature_pipeline._validate_fitparams( {}, dk.feature_pipeline.steps) - # if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): - # dk.pipeline.extend(('noise', ds.Noise())) - def define_label_pipeline(self, dk: FreqaiDataKitchen) -> 
None: dk.label_pipeline = Pipeline([ diff --git a/requirements-freqai.txt b/requirements-freqai.txt index 748950e24..a515ba2b5 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.1.1 +datasieve==0.1.2 diff --git a/tests/freqai/test_freqai_interface.py b/tests/freqai/test_freqai_interface.py index 90959ec2c..fec9de724 100644 --- a/tests/freqai/test_freqai_interface.py +++ b/tests/freqai/test_freqai_interface.py @@ -37,21 +37,22 @@ def can_run_model(model: str) -> None: pytest.skip("Reinforcement learning / PyTorch module not available on intel based Mac OS.") -@pytest.mark.parametrize('model, pca, dbscan, float32, can_short, shuffle, buffer', [ - ('LightGBMRegressor', True, False, True, True, False, 0), - ('XGBoostRegressor', False, True, False, True, False, 10), - ('XGBoostRFRegressor', False, False, False, True, False, 0), - ('CatboostRegressor', False, False, False, True, True, 0), - ('PyTorchMLPRegressor', False, False, False, False, False, 0), - ('PyTorchTransformerRegressor', False, False, False, False, False, 0), - ('ReinforcementLearner', False, True, False, True, False, 0), - ('ReinforcementLearner_multiproc', False, False, False, True, False, 0), - ('ReinforcementLearner_test_3ac', False, False, False, False, False, 0), - ('ReinforcementLearner_test_3ac', False, False, False, True, False, 0), - ('ReinforcementLearner_test_4ac', False, False, False, True, False, 0), +@pytest.mark.parametrize('model, pca, dbscan, float32, can_short, shuffle, buffer, noise', [ + ('LightGBMRegressor', True, False, True, True, False, 0, 0), + ('XGBoostRegressor', False, True, False, True, False, 10, 0.05), + ('XGBoostRFRegressor', False, False, False, True, False, 0, 0), + ('CatboostRegressor', False, False, False, True, True, 0, 0), + ('PyTorchMLPRegressor', False, False, False, False, False, 0, 0), + 
('PyTorchTransformerRegressor', False, False, False, False, False, 0, 0), + ('ReinforcementLearner', False, True, False, True, False, 0, 0), + ('ReinforcementLearner_multiproc', False, False, False, True, False, 0, 0), + ('ReinforcementLearner_test_3ac', False, False, False, False, False, 0, 0), + ('ReinforcementLearner_test_3ac', False, False, False, True, False, 0, 0), + ('ReinforcementLearner_test_4ac', False, False, False, True, False, 0, 0), ]) def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, - dbscan, float32, can_short, shuffle, buffer): + dbscan, float32, can_short, shuffle, + buffer, noise): can_run_model(model) @@ -68,6 +69,7 @@ def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, freqai_conf.update({"reduce_df_footprint": float32}) freqai_conf['freqai']['feature_parameters'].update({"shuffle_after_split": shuffle}) freqai_conf['freqai']['feature_parameters'].update({"buffer_train_data_candles": buffer}) + freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": noise}) if 'ReinforcementLearner' in model: model_save_ext = 'zip' From 135aaa2be2450da404693ae45a90e1461443d62e Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 18:26:49 +0200 Subject: [PATCH 11/32] update docs, improve the interaction with `define_data_pipeline` --- docs/freqai-feature-engineering.md | 21 ++++++---- .../RL/BaseReinforcementLearningModel.py | 37 +++++++++--------- .../freqai/base_models/BaseClassifierModel.py | 29 +++++++------- .../base_models/BasePyTorchClassifier.py | 32 ++++++++-------- .../base_models/BasePyTorchRegressor.py | 38 +++++++++---------- .../freqai/base_models/BaseRegressionModel.py | 35 +++++++++-------- freqtrade/freqai/freqai_interface.py | 30 ++++++++------- 7 files changed, 114 insertions(+), 108 deletions(-) diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index 0eee0793b..364b920a1 100644 --- a/docs/freqai-feature-engineering.md 
+++ b/docs/freqai-feature-engineering.md @@ -226,8 +226,10 @@ FreqAI uses the the [`DataSieve`](https://github.com/emergentmethods/datasieve) This means that users can use/customize any SKLearn modules and easily add them to their FreqAI data pipeline. By default, FreqAI builds the following pipeline: ```py +from datasieve.transforms import SKLearnWrapper, DissimilarityIndex +from datasieve.pipeline import Pipeline dk.feature_pipeline = Pipeline([ - ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))), + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))), ('di', ds.DissimilarityIndex(di_threshold=1)), ]) ``` @@ -235,10 +237,12 @@ dk.feature_pipeline = Pipeline([ But users will find that they can add PCA and other steps just by changing their configuration settings, for example, if you add `"principal_component_analysis": true` to the `feature_parameters` dict in the `freqai` config, then FreqAI will add the PCA step for you resulting in the following pipeline: ```py +from datasieve.transforms import SKLearnWrapper, DissimilarityIndex, PCA +from datasieve.pipeline import Pipeline dk.feature_pipeline = Pipeline([ - ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))), - ('pca', ds.DataSievePCA()), - ('post-pca-scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1))) + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), + ('pca', ds.PCA()), + ('post-pca-scaler', ds.MinMaxScaler(feature_range=(-1, 1))) ('di', ds.DissimilarityIndex(di_threshold=1)), ]) ``` @@ -247,16 +251,19 @@ The same concept follows if users activate other config options like `"use_SVM_t ## Customizing the pipeline -Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. This can be done by overriding `define_data_pipeline` in their `IFreqaiModel`. For example: +Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. 
This can be done by simply setting `dk.feature_pipeline` to their desired `Pipeline` object inside their `IFreqaiModel` `train()` function, or if they prefer not to touch the `train()` function, they can override `define_data_pipeline` in their `IFreqaiModel`: ```py + from datasieve.transforms import SKLearnWrapper, DissimilarityIndex + from datasieve.pipeline import Pipeline + from sklearn.preprocessing import QuantileTransformer def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: """ User defines their custom eature pipeline here (if they wish) """ - from sklearn.preprocessing import QuantileTransformer dk.feature_pipeline = Pipeline([ - ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))) + ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))), + ('di', ds.DissimilarityIndex(di_threshold=1) ]) return diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index bd22decaa..90e60ec5c 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -110,40 +110,37 @@ class BaseReinforcementLearningModel(IFreqaiModel): training_filter=True, ) - d: Dict[str, Any] = dk.make_train_test_datasets( + dd: Dict[str, Any] = dk.make_train_test_datasets( features_filtered, labels_filtered) - self.df_raw = copy.deepcopy(d["train_features"]) + self.df_raw = copy.deepcopy(dd["train_features"]) dk.fit_labels() # FIXME useless for now, but just satiating append methods # normalize all data based on train_dataset only prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk) - self.define_data_pipeline(dk) - self.define_label_pipeline(dk) + dk.feature_pipeline = self.define_data_pipeline() + dk.label_pipeline = self.define_label_pipeline() - # d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) - # d["test_labels"], _, _ = 
dk.label_pipeline.transform(d["test_labels"]) + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) - (d["train_features"], - d["train_labels"], - d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], - d["train_labels"], - d["train_weights"]) - - (d["test_features"], - d["test_labels"], - d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], - d["test_labels"], - d["test_weights"]) + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f'Training model on {len(dk.data_dictionary["train_features"].columns)}' - f' features and {len(d["train_features"])} data points' + f' features and {len(dd["train_features"])} data points' ) - self.set_train_and_eval_environments(d, prices_train, prices_test, dk) + self.set_train_and_eval_environments(dd, prices_train, prices_test, dk) - model = self.fit(d, dk) + model = self.fit(dd, dk) logger.info(f"--------------------done training {pair}--------------------") diff --git a/freqtrade/freqai/base_models/BaseClassifierModel.py b/freqtrade/freqai/base_models/BaseClassifierModel.py index 179e8a5af..8495cd9b9 100644 --- a/freqtrade/freqai/base_models/BaseClassifierModel.py +++ b/freqtrade/freqai/base_models/BaseClassifierModel.py @@ -50,30 +50,29 @@ class BaseClassifierModel(IFreqaiModel): logger.info(f"-------------------- Training on data from {start_date} to " f"{end_date} --------------------") # split data into train/test data. 
- d = dk.make_train_test_datasets(features_filtered, labels_filtered) + dd = dk.make_train_test_datasets(features_filtered, labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - self.define_data_pipeline(dk) - self.define_label_pipeline(dk) + dk.feature_pipeline = self.define_data_pipeline() - (d["train_features"], - d["train_labels"], - d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], - d["train_labels"], - d["train_weights"]) + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) - (d["test_features"], - d["test_labels"], - d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], - d["test_labels"], - d["test_weights"]) + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" ) - logger.info(f"Training model on {len(d['train_features'])} data points") + logger.info(f"Training model on {len(dd['train_features'])} data points") - model = self.fit(d, dk) + model = self.fit(dd, dk) end_time = time() diff --git a/freqtrade/freqai/base_models/BasePyTorchClassifier.py b/freqtrade/freqai/base_models/BasePyTorchClassifier.py index 448384852..85328aa41 100644 --- a/freqtrade/freqai/base_models/BasePyTorchClassifier.py +++ b/freqtrade/freqai/base_models/BasePyTorchClassifier.py @@ -36,6 +36,7 @@ class BasePyTorchClassifier(BasePyTorchModel): return dataframe """ + def __init__(self, **kwargs): super().__init__(**kwargs) self.class_name_to_index = None @@ -184,31 +185,30 @@ class BasePyTorchClassifier(BasePyTorchModel): ) # split data into train/test data. 
- d = dk.make_train_test_datasets(features_filtered, labels_filtered) - if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: + dd = dk.make_train_test_datasets(features_filtered, labels_filtered) + if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) - d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + dk.feature_pipeline = self.define_data_pipeline() - (d["train_features"], - d["train_labels"], - d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], - d["train_labels"], - d["train_weights"]) + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) - (d["test_features"], - d["test_labels"], - d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], - d["test_labels"], - d["test_weights"]) + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" ) - logger.info(f"Training model on {len(d['train_features'])} data points") + logger.info(f"Training model on {len(dd['train_features'])} data points") - model = self.fit(d, dk) + model = self.fit(dd, dk) end_time = time() logger.info(f"-------------------- Done training {pair} " diff --git a/freqtrade/freqai/base_models/BasePyTorchRegressor.py b/freqtrade/freqai/base_models/BasePyTorchRegressor.py index 2f2aaef39..8b304fce4 100644 --- a/freqtrade/freqai/base_models/BasePyTorchRegressor.py +++ b/freqtrade/freqai/base_models/BasePyTorchRegressor.py @@ -18,6 +18,7 @@ class BasePyTorchRegressor(BasePyTorchModel): A PyTorch implementation of a regressor. 
User must implement fit method """ + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -81,34 +82,33 @@ class BasePyTorchRegressor(BasePyTorchModel): ) # split data into train/test data. - d = dk.make_train_test_datasets(features_filtered, labels_filtered) - if not self.freqai_info.get("fit_live_predictions", 0) or not self.live: + dd = dk.make_train_test_datasets(features_filtered, labels_filtered) + if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() + dk.feature_pipeline = self.define_data_pipeline() + dk.label_pipeline = self.define_label_pipeline() - self.define_data_pipeline(dk) - self.define_label_pipeline(dk) + dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) - d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) - d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) - (d["train_features"], - d["train_labels"], - d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], - d["train_labels"], - d["train_weights"]) - - (d["test_features"], - d["test_labels"], - d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], - d["test_labels"], - d["test_weights"]) + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" ) - logger.info(f"Training model on {len(d['train_features'])} data points") + logger.info(f"Training model on {len(dd['train_features'])} data points") - model = self.fit(d, dk) + model = self.fit(dd, dk) end_time = time() 
logger.info(f"-------------------- Done training {pair} " diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index 1babd5f0c..d86b21107 100644 --- a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -49,34 +49,33 @@ class BaseRegressionModel(IFreqaiModel): logger.info(f"-------------------- Training on data from {start_date} to " f"{end_date} --------------------") # split data into train/test data. - d = dk.make_train_test_datasets(features_filtered, labels_filtered) + dd = dk.make_train_test_datasets(features_filtered, labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() + dk.feature_pipeline = self.define_data_pipeline() + dk.label_pipeline = self.define_label_pipeline() - self.define_data_pipeline(dk) - self.define_label_pipeline(dk) + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) - (d["train_features"], - d["train_labels"], - d["train_weights"]) = dk.feature_pipeline.fit_transform(d["train_features"], - d["train_labels"], - d["train_weights"]) + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) - (d["test_features"], - d["test_labels"], - d["test_weights"]) = dk.feature_pipeline.transform(d["test_features"], - d["test_labels"], - d["test_weights"]) - - d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"]) - d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"]) + dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} 
features" ) - logger.info(f"Training model on {len(d['train_features'])} data points") + logger.info(f"Training model on {len(dd['train_features'])} data points") - model = self.fit(d, dk) + model = self.fit(dd, dk) end_time = time() diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 632266b00..a98bd92b5 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -507,43 +507,47 @@ class IFreqaiModel(ABC): "feature_engineering_* functions" ) - def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: + def define_data_pipeline(self) -> Pipeline: ft_params = self.freqai_info["feature_parameters"] - dk.feature_pipeline = Pipeline([ + feature_pipeline = Pipeline([ ('const', ds.VarianceThreshold(threshold=0)), ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) ]) if ft_params.get("principal_component_analysis", False): - dk.feature_pipeline.append(('pca', ds.PCA())) - dk.feature_pipeline.append(('post-pca-scaler', - SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))) + feature_pipeline.append(('pca', ds.PCA())) + feature_pipeline.append(('post-pca-scaler', + SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))) if ft_params.get("use_SVM_to_remove_outliers", False): svm_params = ft_params.get( "svm_params", {"shuffle": False, "nu": 0.01}) - dk.feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params))) + feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params))) di = ft_params.get("DI_threshold", 0) if di: - dk.feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di))) + feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di))) if ft_params.get("use_DBSCAN_to_remove_outliers", False): - dk.feature_pipeline.append(('dbscan', ds.DBSCAN())) + feature_pipeline.append(('dbscan', ds.DBSCAN())) sigma = self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0) if sigma: - dk.feature_pipeline.append(('noise', 
ds.Noise(sigma=sigma))) + feature_pipeline.append(('noise', ds.Noise(sigma=sigma))) - dk.feature_pipeline.fitparams = dk.feature_pipeline._validate_fitparams( - {}, dk.feature_pipeline.steps) + feature_pipeline.fitparams = feature_pipeline._validate_fitparams( + {}, feature_pipeline.steps) - def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None: + return feature_pipeline - dk.label_pipeline = Pipeline([ + def define_label_pipeline(self) -> Pipeline: + + label_pipeline = Pipeline([ ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) ]) + return label_pipeline + def model_exists(self, dk: FreqaiDataKitchen) -> bool: """ Given a pair and path, check if a model already exists From 6d39adc7391e345fb73bea9ce9baa8991e8f597d Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 18:29:49 +0200 Subject: [PATCH 12/32] bump datasieve version --- requirements-freqai.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-freqai.txt b/requirements-freqai.txt index a515ba2b5..94c6e2662 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.1.2 +datasieve==0.1.3 From c066f014e35ba89684e116e5dc4358957ca336b9 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 18:36:07 +0200 Subject: [PATCH 13/32] fix docs --- docs/freqai-feature-engineering.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index 364b920a1..c91e78afc 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -257,16 +257,16 @@ Users are encouraged to customize the data pipeline to their needs by building t from datasieve.transforms import SKLearnWrapper, DissimilarityIndex from datasieve.pipeline import Pipeline from sklearn.preprocessing import 
QuantileTransformer - def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: + def define_data_pipeline(self) -> Pipeline: """ User defines their custom eature pipeline here (if they wish) """ - dk.feature_pipeline = Pipeline([ + feature_pipeline = Pipeline([ ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))), ('di', ds.DissimilarityIndex(di_threshold=1) ]) - return + return feature_pipeline ``` Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. Here you can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class. From 14557f2d326b5b452fa6182d6b3b2f5f36401135 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Wed, 7 Jun 2023 19:24:21 +0200 Subject: [PATCH 14/32] merge develop into outsource-data-pipeline --- requirements-freqai.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-freqai.txt b/requirements-freqai.txt index 94c6e2662..424c74a9e 100644 --- a/requirements-freqai.txt +++ b/requirements-freqai.txt @@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.1.3 +datasieve==0.1.4 From e39e40dc60599f26cb1719837fa168e738fbc6ed Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 8 Jun 2023 11:56:31 +0200 Subject: [PATCH 15/32] improve documentation of pipeline building/customization --- docs/freqai-feature-engineering.md | 46 ++++++++++++++----- .../prediction_models/XGBoostRegressor.py | 3 -- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index c91e78afc..1151f01a3 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -221,20 +221,20 @@ where $W_i$ is the weight of data point $i$ in a total set of $n$ data points. 
B # Building the data pipeline -FreqAI uses the the [`DataSieve`](https://github.com/emergentmethods/datasieve) pipeline, which follows the SKlearn pipeline API, but adds, among other features, coherence between the X, y, and sample_weight vector point removals, and feature removal feature name following. +FreqAI uses the the [`DataSieve`](https://github.com/emergentmethods/datasieve) pipeline, which follows the SKlearn pipeline API, but adds, among other features, coherence between the X, y, and sample_weight vector point removals, feature removal, feature name following. -This means that users can use/customize any SKLearn modules and easily add them to their FreqAI data pipeline. By default, FreqAI builds the following pipeline: +By default, FreqAI builds the following pipeline inside the `IFreqaiModel` `train()` method: ```py from datasieve.transforms import SKLearnWrapper, DissimilarityIndex from datasieve.pipeline import Pipeline dk.feature_pipeline = Pipeline([ - ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))), + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), ('di', ds.DissimilarityIndex(di_threshold=1)), ]) ``` -But users will find that they can add PCA and other steps just by changing their configuration settings, for example, if you add `"principal_component_analysis": true` to the `feature_parameters` dict in the `freqai` config, then FreqAI will add the PCA step for you resulting in the following pipeline: +But users will find that they can add PCA and other steps just by changing their configuration settings, for example, if you add `"principal_component_analysis": true` to the `feature_parameters` dict in the `freqai` config, then FreqAI will automatically add the PCA step for you resulting in the following pipeline: ```py from datasieve.transforms import SKLearnWrapper, DissimilarityIndex, PCA @@ -251,27 +251,49 @@ The same concept follows if users activate other config options like `"use_SVM_t ## Customizing the 
pipeline -Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. This can be done by simply setting `dk.feature_pipeline` to their desired `Pipeline` object inside their `IFreqaiModel` `train()` function, or if they prefer not to touch the `train()` function, they can override `define_data_pipeline` in their `IFreqaiModel`: +Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. This can be done by simply setting `dk.feature_pipeline` to their desired `Pipeline` object inside their `IFreqaiModel` `train()` function, or if they prefer not to touch the `train()` function, they can override `define_data_pipeline`/`define_label_pipeline` functions in their `IFreqaiModel`: ```py - from datasieve.transforms import SKLearnWrapper, DissimilarityIndex - from datasieve.pipeline import Pipeline - from sklearn.preprocessing import QuantileTransformer +from datasieve.transforms import SKLearnWrapper, DissimilarityIndex +from datasieve.pipeline import Pipeline +from sklearn.preprocessing import QuantileTransformer, StandardScaler +from freqai.base_models import BaseRegressionModel + + +class MyFreqaiModel(BaseRegressionModel): + """ + Some cool custom model + """ + def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any: + """ + My custom fit function + """ + model = cool_model.fit() + return model + def define_data_pipeline(self) -> Pipeline: """ - User defines their custom eature pipeline here (if they wish) + User defines their custom feature pipeline here (if they wish) """ feature_pipeline = Pipeline([ ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))), ('di', ds.DissimilarityIndex(di_threshold=1) ]) + return feature_pipeline + + def define_label_pipeline(self) -> Pipeline: + """ + User defines their custom label pipeline here (if they wish) + """ + feature_pipeline = Pipeline([ + ('qt', SKLearnWrapper(StandardScaler())), + ]) + 
return feature_pipeline ``` -Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. Here you can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class. - -As there is the `feature_pipeline`, there also exists a definition for the `label_pipeline` which can be defined the same way as the `feature_pipeline`, by overriding `define_label_pipeline`. +Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. You can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class as shown above. ## Outlier detection diff --git a/freqtrade/freqai/prediction_models/XGBoostRegressor.py b/freqtrade/freqai/prediction_models/XGBoostRegressor.py index 19c051b91..c1142191d 100644 --- a/freqtrade/freqai/prediction_models/XGBoostRegressor.py +++ b/freqtrade/freqai/prediction_models/XGBoostRegressor.py @@ -8,9 +8,6 @@ from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.tensorboard import TBCallback -# from datasieve.pipeline import Pipeline -# from sklearn.preprocessing import QuantileTransformer - logger = logging.getLogger(__name__) From 88337b6c5eef4c8894911a2b48a12250498d1da3 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 8 Jun 2023 12:19:42 +0200 Subject: [PATCH 16/32] convert to using constants in data_drawer. 
Remove unneeded check_if_pred_in_spaces function --- freqtrade/freqai/data_drawer.py | 41 ++++++++++++++++++-------------- freqtrade/freqai/data_kitchen.py | 33 +------------------------ 2 files changed, 24 insertions(+), 50 deletions(-) diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 067790b9a..2a3ec6dd2 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -27,6 +27,11 @@ from freqtrade.strategy.interface import IStrategy logger = logging.getLogger(__name__) +FEATURE_PIPELINE = "feature_pipeline" +LABEL_PIPELINE = "label_pipeline" +TRAINDF = "trained_df" +METADATA = "metadata" + class pair_info(TypedDict): model_filename: str @@ -424,7 +429,7 @@ class FreqaiDataDrawer: dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns) dk.data["label_list"] = dk.label_list - with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp: + with (save_path / f"{dk.model_filename}_{METADATA}.json").open("w") as fp: rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) return @@ -454,19 +459,19 @@ class FreqaiDataDrawer: dk.data["training_features_list"] = dk.training_features_list dk.data["label_list"] = dk.label_list # store the metadata - with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp: + with (save_path / f"{dk.model_filename}_{METADATA}.json").open("w") as fp: rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) # save the pipelines to pickle files - with (save_path / f"{dk.model_filename}_feature_pipeline.pkl").open("wb") as fp: + with (save_path / f"{dk.model_filename}_{FEATURE_PIPELINE}.pkl").open("wb") as fp: cloudpickle.dump(dk.feature_pipeline, fp) - with (save_path / f"{dk.model_filename}_label_pipeline.pkl").open("wb") as fp: + with (save_path / f"{dk.model_filename}_{LABEL_PIPELINE}.pkl").open("wb") as fp: cloudpickle.dump(dk.label_pipeline, fp) # save the train data to 
file so we can check preds for area of applicability later dk.data_dictionary["train_features"].to_pickle( - save_path / f"{dk.model_filename}_trained_df.pkl" + save_path / f"{dk.model_filename}_{TRAINDF}.pkl" ) dk.data_dictionary["train_dates"].to_pickle( @@ -479,10 +484,10 @@ class FreqaiDataDrawer: if coin not in self.meta_data_dictionary: self.meta_data_dictionary[coin] = {} - self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"] - self.meta_data_dictionary[coin]["meta_data"] = dk.data - self.meta_data_dictionary[coin]["feature_pipeline"] = dk.feature_pipeline - self.meta_data_dictionary[coin]["label_pipeline"] = dk.label_pipeline + self.meta_data_dictionary[coin][TRAINDF] = dk.data_dictionary["train_features"] + self.meta_data_dictionary[coin][METADATA] = dk.data + self.meta_data_dictionary[coin][FEATURE_PIPELINE] = dk.feature_pipeline + self.meta_data_dictionary[coin][LABEL_PIPELINE] = dk.label_pipeline self.save_drawer_to_disk() return @@ -492,7 +497,7 @@ class FreqaiDataDrawer: Load only metadata into datakitchen to increase performance during presaved backtesting (prediction file loading). 
""" - with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp: + with (dk.data_path / f"{dk.model_filename}_{METADATA}.json").open("r") as fp: dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) dk.training_features_list = dk.data["training_features_list"] dk.label_list = dk.data["label_list"] @@ -512,20 +517,20 @@ class FreqaiDataDrawer: dk.data_path = Path(self.pair_dict[coin]["data_path"]) if coin in self.meta_data_dictionary: - dk.data = self.meta_data_dictionary[coin]["meta_data"] - dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"] - dk.feature_pipeline = self.meta_data_dictionary[coin]["feature_pipeline"] - dk.label_pipeline = self.meta_data_dictionary[coin]["label_pipeline"] + dk.data = self.meta_data_dictionary[coin][METADATA] + dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin][TRAINDF] + dk.feature_pipeline = self.meta_data_dictionary[coin][FEATURE_PIPELINE] + dk.label_pipeline = self.meta_data_dictionary[coin][LABEL_PIPELINE] else: - with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp: + with (dk.data_path / f"{dk.model_filename}_{METADATA}.json").open("r") as fp: dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) dk.data_dictionary["train_features"] = pd.read_pickle( - dk.data_path / f"{dk.model_filename}_trained_df.pkl" + dk.data_path / f"{dk.model_filename}_{TRAINDF}.pkl" ) - with (dk.data_path / f"{dk.model_filename}_feature_pipeline.pkl").open("rb") as fp: + with (dk.data_path / f"{dk.model_filename}_{FEATURE_PIPELINE}.pkl").open("rb") as fp: dk.feature_pipeline = cloudpickle.load(fp) - with (dk.data_path / f"{dk.model_filename}_label_pipeline.pkl").open("rb") as fp: + with (dk.data_path / f"{dk.model_filename}_{LABEL_PIPELINE}.pkl").open("rb") as fp: dk.label_pipeline = cloudpickle.load(fp) dk.training_features_list = dk.data["training_features_list"] diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py 
index ecdb2e109..de07865d3 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -13,7 +13,6 @@ import pandas as pd import psutil from datasieve.pipeline import Pipeline from pandas import DataFrame -from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import train_test_split from freqtrade.configuration import TimeRange @@ -82,6 +81,7 @@ class FreqaiDataKitchen: self.backtest_live_models = config.get("freqai_backtest_live_models", False) self.feature_pipeline = Pipeline() self.label_pipeline = Pipeline() + self.DI_values: npt.NDArray = np.array([]) if not self.live: self.full_path = self.get_full_models_path(self.config) @@ -391,37 +391,6 @@ class FreqaiDataKitchen: labels = [c for c in column_names if "&" in c] self.label_list = labels - def check_if_pred_in_training_spaces(self) -> None: - """ - Compares the distance from each prediction point to each training data - point. It uses this information to estimate a Dissimilarity Index (DI) - and avoid making predictions on any points that are too far away - from the training data set. - """ - - distance = pairwise_distances( - self.data_dictionary["train_features"], - self.data_dictionary["prediction_features"], - n_jobs=self.thread_count, - ) - - self.DI_values = distance.min(axis=0) / self.data["avg_mean_dist"] - - do_predict = np.where( - self.DI_values < self.freqai_config["feature_parameters"]["DI_threshold"], - 1, - 0, - ) - - if (len(do_predict) - do_predict.sum()) > 0: - logger.info( - f"{self.pair}: DI tossed {len(do_predict) - do_predict.sum()} predictions for " - "being too far from training data." 
- ) - - self.do_predict += do_predict - self.do_predict -= 1 - def set_weights_higher_recent(self, num_weights: int) -> npt.ArrayLike: """ Set weights so that recent data is more heavily weighted during From 33b028b104ea56233d7ebbf61d3c8fa27ed6afad Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 8 Jun 2023 12:33:08 +0200 Subject: [PATCH 17/32] ensure data kitchen thread count is propagated to pipeline --- .../RL/BaseReinforcementLearningModel.py | 4 ++-- .../freqai/base_models/BaseClassifierModel.py | 2 +- .../base_models/BasePyTorchClassifier.py | 2 +- .../base_models/BasePyTorchRegressor.py | 4 ++-- .../freqai/base_models/BaseRegressionModel.py | 4 ++-- freqtrade/freqai/freqai_interface.py | 8 ++++---- .../prediction_models/XGBoostRegressor.py | 20 ------------------- 7 files changed, 12 insertions(+), 32 deletions(-) diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index 90e60ec5c..cffab602d 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -118,8 +118,8 @@ class BaseReinforcementLearningModel(IFreqaiModel): # normalize all data based on train_dataset only prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk) - dk.feature_pipeline = self.define_data_pipeline() - dk.label_pipeline = self.define_label_pipeline() + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) (dd["train_features"], dd["train_labels"], diff --git a/freqtrade/freqai/base_models/BaseClassifierModel.py b/freqtrade/freqai/base_models/BaseClassifierModel.py index 8495cd9b9..2df639b55 100644 --- a/freqtrade/freqai/base_models/BaseClassifierModel.py +++ b/freqtrade/freqai/base_models/BaseClassifierModel.py @@ -53,7 +53,7 @@ class BaseClassifierModel(IFreqaiModel): dd = dk.make_train_test_datasets(features_filtered, 
labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - dk.feature_pipeline = self.define_data_pipeline() + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) (dd["train_features"], dd["train_labels"], diff --git a/freqtrade/freqai/base_models/BasePyTorchClassifier.py b/freqtrade/freqai/base_models/BasePyTorchClassifier.py index 85328aa41..57f31629a 100644 --- a/freqtrade/freqai/base_models/BasePyTorchClassifier.py +++ b/freqtrade/freqai/base_models/BasePyTorchClassifier.py @@ -189,7 +189,7 @@ class BasePyTorchClassifier(BasePyTorchModel): if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - dk.feature_pipeline = self.define_data_pipeline() + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) (dd["train_features"], dd["train_labels"], diff --git a/freqtrade/freqai/base_models/BasePyTorchRegressor.py b/freqtrade/freqai/base_models/BasePyTorchRegressor.py index 8b304fce4..ec4d6b80c 100644 --- a/freqtrade/freqai/base_models/BasePyTorchRegressor.py +++ b/freqtrade/freqai/base_models/BasePyTorchRegressor.py @@ -85,8 +85,8 @@ class BasePyTorchRegressor(BasePyTorchModel): dd = dk.make_train_test_datasets(features_filtered, labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - dk.feature_pipeline = self.define_data_pipeline() - dk.label_pipeline = self.define_label_pipeline() + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index d86b21107..d7e7d9916 100644 --- 
a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -52,8 +52,8 @@ class BaseRegressionModel(IFreqaiModel): dd = dk.make_train_test_datasets(features_filtered, labels_filtered) if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: dk.fit_labels() - dk.feature_pipeline = self.define_data_pipeline() - dk.label_pipeline = self.define_label_pipeline() + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) (dd["train_features"], dd["train_labels"], diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index a98bd92b5..87f682ad3 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -507,7 +507,7 @@ class IFreqaiModel(ABC): "feature_engineering_* functions" ) - def define_data_pipeline(self) -> Pipeline: + def define_data_pipeline(self, threads=-1) -> Pipeline: ft_params = self.freqai_info["feature_parameters"] feature_pipeline = Pipeline([ ('const', ds.VarianceThreshold(threshold=0)), @@ -526,10 +526,10 @@ class IFreqaiModel(ABC): di = ft_params.get("DI_threshold", 0) if di: - feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di))) + feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di, n_jobs=threads))) if ft_params.get("use_DBSCAN_to_remove_outliers", False): - feature_pipeline.append(('dbscan', ds.DBSCAN())) + feature_pipeline.append(('dbscan', ds.DBSCAN(n_jobs=threads))) sigma = self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0) if sigma: @@ -540,7 +540,7 @@ class IFreqaiModel(ABC): return feature_pipeline - def define_label_pipeline(self) -> Pipeline: + def define_label_pipeline(self, threads=-1) -> Pipeline: label_pipeline = Pipeline([ ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) diff --git 
a/freqtrade/freqai/prediction_models/XGBoostRegressor.py b/freqtrade/freqai/prediction_models/XGBoostRegressor.py index c1142191d..f8b4d353d 100644 --- a/freqtrade/freqai/prediction_models/XGBoostRegressor.py +++ b/freqtrade/freqai/prediction_models/XGBoostRegressor.py @@ -52,23 +52,3 @@ class XGBoostRegressor(BaseRegressionModel): model.set_params(callbacks=[]) return model - - # def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None: - # """ - # User defines their custom eature pipeline here (if they wish) - # """ - # dk.feature_pipeline = Pipeline([ - # ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))) - # ]) - - # return - - # def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None: - # """ - # User defines their custom label pipeline here (if they wish) - # """ - # dk.label_pipeline = Pipeline([ - # ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal'))) - # ]) - - # return From e246259792dbdd309d3fdeb1f0fa8ae01ed682f3 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 11:40:57 +0200 Subject: [PATCH 18/32] avoid manual pipeline validation --- docs/freqai-feature-engineering.md | 67 ++++++++++++++++------------ freqtrade/freqai/freqai_interface.py | 23 +++++----- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index 1151f01a3..6e3e7fda6 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -221,39 +221,20 @@ where $W_i$ is the weight of data point $i$ in a total set of $n$ data points. B # Building the data pipeline -FreqAI uses the the [`DataSieve`](https://github.com/emergentmethods/datasieve) pipeline, which follows the SKlearn pipeline API, but adds, among other features, coherence between the X, y, and sample_weight vector point removals, feature removal, feature name following. +By default, FreqAI builds a dynamic pipeline based on user congfiguration settings. 
The default settings are robust and designed to work with a variety of methods. These two steps are a `MinMaxScaler(-1,1)` and a `VarianceThreshold` which removes any column that has 0 variance. Users can activate other steps with more configuration parameters. For example if users add `use_SVM_to_remove_outliers: true` to the `freqai` config, then FreqAI will automatically add the [`SVMOutlierExtractor`](#identifying-outliers-using-a-support-vector-machine-svm) to the pipeline. Likewise, users can add `principal_component_analysis: true` to the `freqai` config to activate PCA. The [DissimilarityIndex](#identifying-outliers-with-the-dissimilarity-index-di) is activated with `DI_threshold: 1`. Finally, noise can also be added to the data with `noise_standard_deviation: 0.1`. Finally, users can add [DBSCAN](#identifying-outliers-with-dbscan) outlier removal with `use_DBSCAN_to_remove_outliers: true`. -By default, FreqAI builds the following pipeline inside the `IFreqaiModel` `train()` method: +!!! note "More information available" + Please review the [parameter table](freqai-parameter-table.md) for more information on these parameters. 
-```py -from datasieve.transforms import SKLearnWrapper, DissimilarityIndex -from datasieve.pipeline import Pipeline -dk.feature_pipeline = Pipeline([ - ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), - ('di', ds.DissimilarityIndex(di_threshold=1)), - ]) -``` - -But users will find that they can add PCA and other steps just by changing their configuration settings, for example, if you add `"principal_component_analysis": true` to the `feature_parameters` dict in the `freqai` config, then FreqAI will automatically add the PCA step for you resulting in the following pipeline: - -```py -from datasieve.transforms import SKLearnWrapper, DissimilarityIndex, PCA -from datasieve.pipeline import Pipeline -dk.feature_pipeline = Pipeline([ - ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))), - ('pca', ds.PCA()), - ('post-pca-scaler', ds.MinMaxScaler(feature_range=(-1, 1))) - ('di', ds.DissimilarityIndex(di_threshold=1)), - ]) -``` - -The same concept follows if users activate other config options like `"use_SVM_to_remove_outliers": true` or `"use_DBSCAN_to_remove_outliers": true`. FreqAI will add the appropriate steps to the pipeline for you. ## Customizing the pipeline Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. This can be done by simply setting `dk.feature_pipeline` to their desired `Pipeline` object inside their `IFreqaiModel` `train()` function, or if they prefer not to touch the `train()` function, they can override `define_data_pipeline`/`define_label_pipeline` functions in their `IFreqaiModel`: -```py +!!! note "More information available" + FreqAI uses the the [`DataSieve`](https://github.com/emergentmethods/datasieve) pipeline, which follows the SKlearn pipeline API, but adds, among other features, coherence between the X, y, and sample_weight vector point removals, feature removal, feature name following. 
+ +```python from datasieve.transforms import SKLearnWrapper, DissimilarityIndex from datasieve.pipeline import Pipeline from sklearn.preprocessing import QuantileTransformer, StandardScaler @@ -286,14 +267,42 @@ class MyFreqaiModel(BaseRegressionModel): """ User defines their custom label pipeline here (if they wish) """ - feature_pipeline = Pipeline([ + label_pipeline = Pipeline([ ('qt', SKLearnWrapper(StandardScaler())), ]) - return feature_pipeline + return label_pipeline ``` -Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. You can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class as shown above. +Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. You can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class as shown above. In addition, you can use any of the transformations available in the [`DataSieve` library](https://github.com/emergentmethods/datasieve). 
+ +You can easily add your own transformation by creating a class that inherits from the datasieve `BaseTransform` and implementing your `fit()`, `transform()` and `inverse_transform()` methods: + +```python +from datasieve.transforms.base_transform import BaseTransform +# import whatever else you need + +class MyCoolTransform(BaseTransform): + def __init__(self, **kwargs): + self.param1 = kwargs.get('param1', 1) + + def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + # do something with X, y, sample_weight, or/and feature_list + return X, y, sample_weight, feature_list + + def transform(self, X, y=None, sample_weight=None, + feature_list=None, outlier_check=False, **kwargs): + # do something with X, y, sample_weight, or/and feature_list + return X, y, sample_weight, feature_list + + def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs): + # do/dont do something with X, y, sample_weight, or/and feature_list + return X, y, sample_weight, feature_list +``` + +!!! note "Hint" + You can define this custom class in the same file as your `IFreqaiModel`. 
+ ## Outlier detection diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 87f682ad3..104fcb24d 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -509,36 +509,33 @@ class IFreqaiModel(ABC): def define_data_pipeline(self, threads=-1) -> Pipeline: ft_params = self.freqai_info["feature_parameters"] - feature_pipeline = Pipeline([ + pipe_steps = [ ('const', ds.VarianceThreshold(threshold=0)), ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) - ]) + ] if ft_params.get("principal_component_analysis", False): - feature_pipeline.append(('pca', ds.PCA())) - feature_pipeline.append(('post-pca-scaler', - SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))) + pipe_steps.append(('pca', ds.PCA())) + pipe_steps.append(('post-pca-scaler', + SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))) if ft_params.get("use_SVM_to_remove_outliers", False): svm_params = ft_params.get( "svm_params", {"shuffle": False, "nu": 0.01}) - feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params))) + pipe_steps.append(('svm', ds.SVMOutlierExtractor(**svm_params))) di = ft_params.get("DI_threshold", 0) if di: - feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di, n_jobs=threads))) + pipe_steps.append(('di', ds.DissimilarityIndex(di_threshold=di, n_jobs=threads))) if ft_params.get("use_DBSCAN_to_remove_outliers", False): - feature_pipeline.append(('dbscan', ds.DBSCAN(n_jobs=threads))) + pipe_steps.append(('dbscan', ds.DBSCAN(n_jobs=threads))) sigma = self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0) if sigma: - feature_pipeline.append(('noise', ds.Noise(sigma=sigma))) + pipe_steps.append(('noise', ds.Noise(sigma=sigma))) - feature_pipeline.fitparams = feature_pipeline._validate_fitparams( - {}, feature_pipeline.steps) - - return feature_pipeline + return Pipeline(pipe_steps) def define_label_pipeline(self, threads=-1) -> Pipeline: From 
4cdd6bc6c34c24c00aff2850f9ef5883495d3527 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 12:07:03 +0200 Subject: [PATCH 19/32] avoid using ram for unnecessary train_df, fix some deprecation warnings --- .../freqai/base_models/FreqaiMultiOutputClassifier.py | 3 +-- freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py | 3 +-- freqtrade/freqai/data_drawer.py | 7 +------ 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py b/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py index 435c0e646..4646bb9a8 100644 --- a/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py +++ b/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py @@ -1,9 +1,8 @@ import numpy as np -from joblib import Parallel from sklearn.base import is_classifier from sklearn.multioutput import MultiOutputClassifier, _fit_estimator -from sklearn.utils.fixes import delayed from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import has_fit_parameter from freqtrade.exceptions import OperationalException diff --git a/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py b/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py index 54136d5e0..a6cc4f39b 100644 --- a/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py +++ b/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py @@ -1,6 +1,5 @@ -from joblib import Parallel from sklearn.multioutput import MultiOutputRegressor, _fit_estimator -from sklearn.utils.fixes import delayed +from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import has_fit_parameter diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 2a3ec6dd2..edd9640c9 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -469,7 +469,7 @@ class FreqaiDataDrawer: with (save_path / 
f"{dk.model_filename}_{LABEL_PIPELINE}.pkl").open("wb") as fp: cloudpickle.dump(dk.label_pipeline, fp) - # save the train data to file so we can check preds for area of applicability later + # save the train data to file for post processing if desired dk.data_dictionary["train_features"].to_pickle( save_path / f"{dk.model_filename}_{TRAINDF}.pkl" ) @@ -484,7 +484,6 @@ class FreqaiDataDrawer: if coin not in self.meta_data_dictionary: self.meta_data_dictionary[coin] = {} - self.meta_data_dictionary[coin][TRAINDF] = dk.data_dictionary["train_features"] self.meta_data_dictionary[coin][METADATA] = dk.data self.meta_data_dictionary[coin][FEATURE_PIPELINE] = dk.feature_pipeline self.meta_data_dictionary[coin][LABEL_PIPELINE] = dk.label_pipeline @@ -518,16 +517,12 @@ class FreqaiDataDrawer: if coin in self.meta_data_dictionary: dk.data = self.meta_data_dictionary[coin][METADATA] - dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin][TRAINDF] dk.feature_pipeline = self.meta_data_dictionary[coin][FEATURE_PIPELINE] dk.label_pipeline = self.meta_data_dictionary[coin][LABEL_PIPELINE] else: with (dk.data_path / f"{dk.model_filename}_{METADATA}.json").open("r") as fp: dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) - dk.data_dictionary["train_features"] = pd.read_pickle( - dk.data_path / f"{dk.model_filename}_{TRAINDF}.pkl" - ) with (dk.data_path / f"{dk.model_filename}_{FEATURE_PIPELINE}.pkl").open("rb") as fp: dk.feature_pipeline = cloudpickle.load(fp) with (dk.data_path / f"{dk.model_filename}_{LABEL_PIPELINE}.pkl").open("rb") as fp: From f8d7c2e21dc11c5e716a431b51fcf4094213d365 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 12:48:27 +0200 Subject: [PATCH 20/32] add migration guide, add protections and migration assistance --- docs/freqai-feature-engineering.md | 66 ++++++++++++++++++++++++- docs/strategy_migration.md | 74 ++++++++++++++++++++++++++++ freqtrade/freqai/data_kitchen.py | 64 ++++++++++++++++++++++++ 
freqtrade/freqai/freqai_interface.py | 23 +++++++++ 4 files changed, 225 insertions(+), 2 deletions(-) diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index 6e3e7fda6..12e01e30d 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -219,7 +219,7 @@ where $W_i$ is the weight of data point $i$ in a total set of $n$ data points. B ![weight-factor](assets/freqai_weight-factor.jpg) -# Building the data pipeline +## Building the data pipeline By default, FreqAI builds a dynamic pipeline based on user congfiguration settings. The default settings are robust and designed to work with a variety of methods. These two steps are a `MinMaxScaler(-1,1)` and a `VarianceThreshold` which removes any column that has 0 variance. Users can activate other steps with more configuration parameters. For example if users add `use_SVM_to_remove_outliers: true` to the `freqai` config, then FreqAI will automatically add the [`SVMOutlierExtractor`](#identifying-outliers-using-a-support-vector-machine-svm) to the pipeline. Likewise, users can add `principal_component_analysis: true` to the `freqai` config to activate PCA. The [DissimilarityIndex](#identifying-outliers-with-the-dissimilarity-index-di) is activated with `DI_threshold: 1`. Finally, noise can also be added to the data with `noise_standard_deviation: 0.1`. Finally, users can add [DBSCAN](#identifying-outliers-with-dbscan) outlier removal with `use_DBSCAN_to_remove_outliers: true`. @@ -227,7 +227,7 @@ By default, FreqAI builds a dynamic pipeline based on user congfiguration settin Please review the [parameter table](freqai-parameter-table.md) for more information on these parameters. -## Customizing the pipeline +### Customizing the pipeline Users are encouraged to customize the data pipeline to their needs by building their own data pipeline. 
This can be done by simply setting `dk.feature_pipeline` to their desired `Pipeline` object inside their `IFreqaiModel` `train()` function, or if they prefer not to touch the `train()` function, they can override `define_data_pipeline`/`define_label_pipeline` functions in their `IFreqaiModel`: @@ -303,6 +303,68 @@ class MyCoolTransform(BaseTransform): !!! note "Hint" You can define this custom class in the same file as your `IFreqaiModel`. +### Migrating a custom `IFreqaiModel` to the new Pipeline + +If you have created your own custom `IFreqaiModel` with a custom `train()`/`predict()` function, *and* you still rely on `data_cleaning_train/predict()`, then you will need to migrate to the new pipeline. If your model does *not* rely on `data_cleaning_train/predict()`, then you do not need to worry about this migration. + +The conversion involves first removing `data_cleaning_train/predict()` and replacing them with a `define_data_pipeline()` and `define_label_pipeline()` function to your `IFreqaiModel` class: + +```python +class MyCoolFreqaiModel(BaseRegressionModel): + def train( + self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs + ) -> Any: + + # ... your custom stuff + + # Remove these lines + # data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) + # self.data_cleaning_train(dk) + # data_dictionary = dk.normalize_data(data_dictionary) + + # Add these lines. 
Now we control the pipeline fit/transform ourselves + dd = dk.make_train_test_datasets(features_filtered, labels_filtered) + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) + + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) + + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) + + dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) + + def predict( + self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs + ) -> Tuple[DataFrame, npt.NDArray[np.int_]]: + + # ... your custom stuff + + # Remove these lines: + # self.data_cleaning_predict(dk) + + # Add these lines: + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) + + # Remove this line + # pred_df = dk.denormalize_labels_from_metadata(pred_df) + + # Replace with these lines + pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() ## Outlier detection diff --git a/docs/strategy_migration.md b/docs/strategy_migration.md index 5ef7a5a4c..4c10fb126 100644 --- a/docs/strategy_migration.md +++ b/docs/strategy_migration.md @@ -728,3 +728,77 @@ Targets now get their own, dedicated method. 
return dataframe ``` + + +### FreqAI - New data Pipeline + +If you have created your own custom `IFreqaiModel` with a custom `train()`/`predict()` function, *and* you still rely on `data_cleaning_train/predict()`, then you will need to migrate to the new pipeline. If your model does *not* rely on `data_cleaning_train/predict()`, then you do not need to worry about this migration. That means that this migration guide is relevant for a very small percentage of power-users. If you stumbled upon this guide by mistake, feel free to inquire in depth about your problem in the Freqtrade discord server. + +The conversion involves first removing `data_cleaning_train/predict()` and replacing them with a `define_data_pipeline()` and `define_label_pipeline()` function to your `IFreqaiModel` class: + +```python linenums="1" hl_lines="10-13 41-42 48-49" +class MyCoolFreqaiModel(BaseRegressionModel): + """ + Some cool custom IFreqaiModel you made before Freqtrade version 2023.6 + """ + def train( + self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs + ) -> Any: + + # ... your custom stuff + + # Remove these lines + # data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) + # self.data_cleaning_train(dk) + # data_dictionary = dk.normalize_data(data_dictionary) + + # Add these lines. 
Now we control the pipeline fit/transform ourselves + dd = dk.make_train_test_datasets(features_filtered, labels_filtered) + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) + + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) + + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) + + dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) + + def predict( + self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs + ) -> Tuple[DataFrame, npt.NDArray[np.int_]]: # 37 + + # ... your custom stuff + + # Remove these lines: + # self.data_cleaning_predict(dk) + + # Add these lines: + dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], outlier_check=True) + + # Remove this line + # pred_df = dk.denormalize_labels_from_metadata(pred_df) + + # Replace with these lines + pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() +``` + + +1. Features - Move to `feature_engineering_expand_all` +2. Basic features, not expanded across `include_periods_candles` - move to`feature_engineering_expand_basic()`. +3. Standard features which should not be expanded - move to `feature_engineering_standard()`. +4. Targets - Move this part to `set_freqai_targets()`. 
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index de07865d3..215457992 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -12,6 +12,7 @@ import numpy.typing as npt import pandas as pd import psutil from datasieve.pipeline import Pipeline +from datasieve.transforms import SKLearnWrapper from pandas import DataFrame from sklearn.model_selection import train_test_split @@ -950,3 +951,66 @@ class FreqaiDataKitchen: timerange.startts += buffer * timeframe_to_seconds(self.config["timeframe"]) return timerange + + # deprecated functions + def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: + """ + Deprecation warning, migration assistance + """ + ft = "https://www.freqtrade.io/en/latest" + logger.warning(f"Your custom IFreqaiModel relies on the deprecated" + " data pipeline. Please update your model to use the new data pipeline." + " This can be achieved by following the migration guide at " + f"{ft}/strategy_migration/#freqai-new-data-pipeline " + "We added a basic pipeline for you, but this will be removed " + "in a future version.\n" + "This version does not include any outlier configurations") + + import datasieve.transforms as ds + from sklearn.preprocessing import MinMaxScaler + dd = data_dictionary + + self.feature_pipeline = Pipeline([ + ('variance_threshold', ds.VarianceThreshold()), + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) + ]) + + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = self.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) + + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = self.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) + + self.label_pipeline = Pipeline([ + ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) + ]) + + dd["train_labels"], _, _ = 
self.label_pipeline.fit_transform(dd["train_labels"]) + dd["test_labels"], _, _ = self.label_pipeline.transform(dd["test_labels"]) + + return dd + + def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Deprecation warning, migration assistance + """ + ft = "https://www.freqtrade.io/en/latest" + logger.warning(f"Your custom IFreqaiModel relies on the deprecated" + " data pipeline. Please update your model to use the new data pipeline." + " This can be achieved by following the migration guide at " + f"{ft}/strategy_migration/#freqai-new-data-pipeline " + "We added a basic pipeline for you, but this will be removed " + "in a future version.\n" + "This version does not include any outlier configurations") + + pred_df, _, _ = self.label_pipeline.inverse_transform(df) + self.DI_values = np.zeros(len(pred_df.index)) + self.do_predict = np.ones(len(pred_df.index)) + + return pred_df diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 104fcb24d..eff8d4bd5 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -968,3 +968,26 @@ class IFreqaiModel(ABC): :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove data (NaNs) or felt uncertain about data (i.e. SVM and/or DI index) """ + + # deprecated functions + def data_cleaning_train(self, dk: FreqaiDataKitchen, pair: str): + """ + throw deprecation warning if this function is called + """ + ft = "https://www.freqtrade.io/en/latest" + logger.warning(f"Your model {self.__class__.__name__} relies on the deprecated" + " data pipeline. Please update your model to use the new data pipeline." 
+ " This can be achieved by following the migration guide at " + f"{ft}/strategy_migration/#freqai-new-data-pipeline") + return + + def data_cleaning_predict(self, dk: FreqaiDataKitchen, pair: str): + """ + throw deprecation warning if this function is called + """ + ft = "https://www.freqtrade.io/en/latest" + logger.warning(f"Your model {self.__class__.__name__} relies on the deprecated" + " data pipeline. Please update your model to use the new data pipeline." + " This can be achieved by following the migration guide at " + f"{ft}/strategy_migration/#freqai-new-data-pipeline") + return From d9bdd879ab35131aa708a2dce27a447b43451886 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 13:00:59 +0200 Subject: [PATCH 21/32] improve migration doc --- docs/strategy_migration.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/strategy_migration.md b/docs/strategy_migration.md index 4c10fb126..2fef5e516 100644 --- a/docs/strategy_migration.md +++ b/docs/strategy_migration.md @@ -736,7 +736,7 @@ If you have created your own custom `IFreqaiModel` with a custom `train()`/`pred The conversion involves first removing `data_cleaning_train/predict()` and replacing them with a `define_data_pipeline()` and `define_label_pipeline()` function to your `IFreqaiModel` class: -```python linenums="1" hl_lines="10-13 41-42 48-49" +```python linenums="1" hl_lines="11-14 43-44 51-52" class MyCoolFreqaiModel(BaseRegressionModel): """ Some cool custom IFreqaiModel you made before Freqtrade version 2023.6 @@ -751,6 +751,7 @@ class MyCoolFreqaiModel(BaseRegressionModel): # data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) # self.data_cleaning_train(dk) # data_dictionary = dk.normalize_data(data_dictionary) + # (1) # Add these lines. 
Now we control the pipeline fit/transform ourselves dd = dk.make_train_test_datasets(features_filtered, labels_filtered) @@ -780,6 +781,7 @@ class MyCoolFreqaiModel(BaseRegressionModel): # Remove these lines: # self.data_cleaning_predict(dk) + # (2) # Add these lines: dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( @@ -787,6 +789,7 @@ class MyCoolFreqaiModel(BaseRegressionModel): # Remove this line # pred_df = dk.denormalize_labels_from_metadata(pred_df) + # (3) # Replace with these lines pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) @@ -798,7 +801,6 @@ class MyCoolFreqaiModel(BaseRegressionModel): ``` -1. Features - Move to `feature_engineering_expand_all` -2. Basic features, not expanded across `include_periods_candles` - move to`feature_engineering_expand_basic()`. -3. Standard features which should not be expanded - move to `feature_engineering_standard()`. -4. Targets - Move this part to `set_freqai_targets()`. +1. Data normalization and cleaning is now homogenized with the new pipeline definition. This is created in the new `define_data_pipeline()` and `define_label_pipeline()` functions. The `data_cleaning_train()` and `data_cleaning_predict()` functions are no longer used. You can override `define_data_pipeline()` to create your own custom pipeline if you wish. +2. Data normalization and cleaning is now homogenized with the new pipeline definition. This is created in the new `define_data_pipeline()` and `define_label_pipeline()` functions. The `data_cleaning_train()` and `data_cleaning_predict()` functions are no longer used. You can override `define_data_pipeline()` to create your own custom pipeline if you wish. +3. Data denormalization is done with the new pipeline. Replace this with the lines below. 
From 41e37f9d322d95706c9c11afe7792a544c56d778 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 13:11:47 +0200 Subject: [PATCH 22/32] improve docs, update doc strings --- docs/strategy_migration.md | 11 +++++++++-- freqtrade/freqai/base_models/BaseClassifierModel.py | 4 ++-- freqtrade/freqai/base_models/BaseRegressionModel.py | 4 ++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/strategy_migration.md b/docs/strategy_migration.md index 2fef5e516..353da0ccb 100644 --- a/docs/strategy_migration.md +++ b/docs/strategy_migration.md @@ -736,7 +736,7 @@ If you have created your own custom `IFreqaiModel` with a custom `train()`/`pred The conversion involves first removing `data_cleaning_train/predict()` and replacing them with a `define_data_pipeline()` and `define_label_pipeline()` function to your `IFreqaiModel` class: -```python linenums="1" hl_lines="11-14 43-44 51-52" +```python linenums="1" hl_lines="11-14 47-49 55-57" class MyCoolFreqaiModel(BaseRegressionModel): """ Some cool custom IFreqaiModel you made before Freqtrade version 2023.6 @@ -773,9 +773,13 @@ class MyCoolFreqaiModel(BaseRegressionModel): dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) + # ... your custom code + + return model + def predict( self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs - ) -> Tuple[DataFrame, npt.NDArray[np.int_]]: # 37 + ) -> Tuple[DataFrame, npt.NDArray[np.int_]]: # ... your custom stuff @@ -798,6 +802,9 @@ class MyCoolFreqaiModel(BaseRegressionModel): else: dk.DI_values = np.zeros(len(outliers.index)) dk.do_predict = outliers.to_numpy() + + # ... 
your custom code + return (pred_df, dk.do_predict) ``` diff --git a/freqtrade/freqai/base_models/BaseClassifierModel.py b/freqtrade/freqai/base_models/BaseClassifierModel.py index 2df639b55..e536efea3 100644 --- a/freqtrade/freqai/base_models/BaseClassifierModel.py +++ b/freqtrade/freqai/base_models/BaseClassifierModel.py @@ -17,8 +17,8 @@ logger = logging.getLogger(__name__) class BaseClassifierModel(IFreqaiModel): """ Base class for regression type models (e.g. Catboost, LightGBM, XGboost etc.). - User *must* inherit from this class and set fit() and predict(). See example scripts - such as prediction_models/CatboostPredictionModel.py for guidance. + User *must* inherit from this class and set fit(). See example scripts + such as prediction_models/CatboostClassifier.py for guidance. """ def train( diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index d7e7d9916..f1e33bff8 100644 --- a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -16,8 +16,8 @@ logger = logging.getLogger(__name__) class BaseRegressionModel(IFreqaiModel): """ Base class for regression type models (e.g. Catboost, LightGBM, XGboost etc.). - User *must* inherit from this class and set fit() and predict(). See example scripts - such as prediction_models/CatboostPredictionModel.py for guidance. + User *must* inherit from this class and set fit(). See example scripts + such as prediction_models/CatboostRegressor.py for guidance. 
""" def train( From 229ee643cdf50c34ff19eea48ce99432f7241c58 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 13:24:09 +0200 Subject: [PATCH 23/32] revert change to deal with FT pinning old scikit-learn version --- freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py | 3 ++- freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py b/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py index 4646bb9a8..435c0e646 100644 --- a/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py +++ b/freqtrade/freqai/base_models/FreqaiMultiOutputClassifier.py @@ -1,8 +1,9 @@ import numpy as np +from joblib import Parallel from sklearn.base import is_classifier from sklearn.multioutput import MultiOutputClassifier, _fit_estimator +from sklearn.utils.fixes import delayed from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import has_fit_parameter from freqtrade.exceptions import OperationalException diff --git a/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py b/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py index a6cc4f39b..54136d5e0 100644 --- a/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py +++ b/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py @@ -1,5 +1,6 @@ +from joblib import Parallel from sklearn.multioutput import MultiOutputRegressor, _fit_estimator -from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.fixes import delayed from sklearn.utils.validation import has_fit_parameter From ad8a4897cee98d8b3963662aa002164ec493ab8e Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 10 Jun 2023 16:13:28 +0200 Subject: [PATCH 24/32] remove unnecessary example in feature_engineering.md --- docs/freqai-feature-engineering.md | 59 +----------------------------- 1 file 
changed, 1 insertion(+), 58 deletions(-) diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md index 12e01e30d..415ac2c63 100644 --- a/docs/freqai-feature-engineering.md +++ b/docs/freqai-feature-engineering.md @@ -307,64 +307,7 @@ class MyCoolTransform(BaseTransform): If you have created your own custom `IFreqaiModel` with a custom `train()`/`predict()` function, *and* you still rely on `data_cleaning_train/predict()`, then you will need to migrate to the new pipeline. If your model does *not* rely on `data_cleaning_train/predict()`, then you do not need to worry about this migration. -The conversion involves first removing `data_cleaning_train/predict()` and replacing them with a `define_data_pipeline()` and `define_label_pipeline()` function to your `IFreqaiModel` class: - -```python -class MyCoolFreqaiModel(BaseRegressionModel): - def train( - self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs - ) -> Any: - - # ... your custom stuff - - # Remove these lines - # data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) - # self.data_cleaning_train(dk) - # data_dictionary = dk.normalize_data(data_dictionary) - - # Add these lines. 
Now we control the pipeline fit/transform ourselves - dd = dk.make_train_test_datasets(features_filtered, labels_filtered) - dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) - dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) - - (dd["train_features"], - dd["train_labels"], - dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], - dd["train_labels"], - dd["train_weights"]) - - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], - dd["test_labels"], - dd["test_weights"]) - - dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) - dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) - - def predict( - self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs - ) -> Tuple[DataFrame, npt.NDArray[np.int_]]: - - # ... your custom stuff - - # Remove these lines: - # self.data_cleaning_predict(dk) - - # Add these lines: - dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( - dk.data_dictionary["prediction_features"], outlier_check=True) - - # Remove this line - # pred_df = dk.denormalize_labels_from_metadata(pred_df) - - # Replace with these lines - pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) - if self.freqai_info.get("DI_threshold", 0) > 0: - dk.DI_values = dk.feature_pipeline["di"].di_values - else: - dk.DI_values = np.zeros(len(outliers.index)) - dk.do_predict = outliers.to_numpy() +More details about the migration can be found [here](strategy_migration.md#freqai---new-data-pipeline). 
## Outlier detection From 75ec19062c883da435d07f1f7d879e84f7075d08 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 16 Jun 2023 13:06:21 +0200 Subject: [PATCH 25/32] chore: make DOCS_LINK in constants.py, ensure datasieve is added to setup.py --- freqtrade/constants.py | 1 + .../RL/BaseReinforcementLearningModel.py | 1 - freqtrade/freqai/data_kitchen.py | 31 +------------------ freqtrade/freqai/freqai_interface.py | 10 +++--- setup.py | 3 +- 5 files changed, 9 insertions(+), 37 deletions(-) diff --git a/freqtrade/constants.py b/freqtrade/constants.py index 7012acb7c..acfca6fa5 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List, Literal, Tuple from freqtrade.enums import CandleType, PriceType, RPCMessageType +DOCS_LINK = "https://www.freqtrade.io/en/stable" DEFAULT_CONFIG = 'config.json' DEFAULT_EXCHANGE = 'bittrex' PROCESS_THROTTLE_SECS = 5 # sec diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index cffab602d..b59c47ad2 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -119,7 +119,6 @@ class BaseReinforcementLearningModel(IFreqaiModel): prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk) dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) - dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) (dd["train_features"], dd["train_labels"], diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 215457992..3f8d0fb4b 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -12,7 +12,6 @@ import numpy.typing as npt import pandas as pd import psutil from datasieve.pipeline import Pipeline -from datasieve.transforms import SKLearnWrapper from pandas import DataFrame from sklearn.model_selection import train_test_split 
@@ -966,35 +965,7 @@ class FreqaiDataKitchen: "in a future version.\n" "This version does not include any outlier configurations") - import datasieve.transforms as ds - from sklearn.preprocessing import MinMaxScaler - dd = data_dictionary - - self.feature_pipeline = Pipeline([ - ('variance_threshold', ds.VarianceThreshold()), - ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) - ]) - - (dd["train_features"], - dd["train_labels"], - dd["train_weights"]) = self.feature_pipeline.fit_transform(dd["train_features"], - dd["train_labels"], - dd["train_weights"]) - - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = self.feature_pipeline.transform(dd["test_features"], - dd["test_labels"], - dd["test_weights"]) - - self.label_pipeline = Pipeline([ - ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))) - ]) - - dd["train_labels"], _, _ = self.label_pipeline.fit_transform(dd["train_labels"]) - dd["test_labels"], _, _ = self.label_pipeline.transform(dd["test_labels"]) - - return dd + return data_dictionary def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: """ diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index eff8d4bd5..a6e5d40ed 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -18,7 +18,7 @@ from pandas import DataFrame from sklearn.preprocessing import MinMaxScaler from freqtrade.configuration import TimeRange -from freqtrade.constants import Config +from freqtrade.constants import DOCS_LINK, Config from freqtrade.data.dataprovider import DataProvider from freqtrade.enums import RunMode from freqtrade.exceptions import OperationalException @@ -974,20 +974,20 @@ class IFreqaiModel(ABC): """ throw deprecation warning if this function is called """ - ft = "https://www.freqtrade.io/en/latest" logger.warning(f"Your model {self.__class__.__name__} relies on the deprecated" " data pipeline. 
Please update your model to use the new data pipeline." " This can be achieved by following the migration guide at " - f"{ft}/strategy_migration/#freqai-new-data-pipeline") + f"{DOCS_LINK}/strategy_migration/#freqai-new-data-pipeline") + dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) return def data_cleaning_predict(self, dk: FreqaiDataKitchen, pair: str): """ throw deprecation warning if this function is called """ - ft = "https://www.freqtrade.io/en/latest" logger.warning(f"Your model {self.__class__.__name__} relies on the deprecated" " data pipeline. Please update your model to use the new data pipeline." " This can be achieved by following the migration guide at " - f"{ft}/strategy_migration/#freqai-new-data-pipeline") + f"{DOCS_LINK}/strategy_migration/#freqai-new-data-pipeline") + dk.label_pipeline = self.define_data_pipeline(threads=dk.thread_count) return diff --git a/setup.py b/setup.py index 106b5b6d3..9a04e07d0 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,8 @@ freqai = [ 'catboost; platform_machine != "aarch64"', 'lightgbm', 'xgboost', - 'tensorboard' + 'tensorboard', + 'datasieve>=0.1.4' ] freqai_rl = [ From 72101f059dad268c7977d5eb2227766d5df86da6 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 16 Jun 2023 13:20:35 +0200 Subject: [PATCH 26/32] feat: ensure full backwards compatibility --- freqtrade/freqai/data_kitchen.py | 8 ++------ freqtrade/freqai/freqai_interface.py | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 3f8d0fb4b..3a91c8551 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -962,8 +962,7 @@ class FreqaiDataKitchen: " This can be achieved by following the migration guide at " f"{ft}/strategy_migration/#freqai-new-data-pipeline " "We added a basic pipeline for you, but this will be removed " - "in a future version.\n" - "This version does not include 
any outlier configurations") + "in a future version.") return data_dictionary @@ -977,11 +976,8 @@ class FreqaiDataKitchen: " This can be achieved by following the migration guide at " f"{ft}/strategy_migration/#freqai-new-data-pipeline " "We added a basic pipeline for you, but this will be removed " - "in a future version.\n" - "This version does not include any outlier configurations") + "in a future version.") pred_df, _, _ = self.label_pipeline.inverse_transform(df) - self.DI_values = np.zeros(len(pred_df.index)) - self.do_predict = np.ones(len(pred_df.index)) return pred_df diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index a6e5d40ed..4ca5467b6 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -979,6 +979,23 @@ class IFreqaiModel(ABC): " This can be achieved by following the migration guide at " f"{DOCS_LINK}/strategy_migration/#freqai-new-data-pipeline") dk.feature_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dd = dk.data_dictionary + (dd["train_features"], + dd["train_labels"], + dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], + dd["train_labels"], + dd["train_weights"]) + + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) + + dk.label_pipeline = self.define_label_pipeline(threads=dk.thread_count) + + dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) return def data_cleaning_predict(self, dk: FreqaiDataKitchen, pair: str): @@ -989,5 +1006,12 @@ class IFreqaiModel(ABC): " data pipeline. Please update your model to use the new data pipeline." 
" This can be achieved by following the migration guide at " f"{DOCS_LINK}/strategy_migration/#freqai-new-data-pipeline") - dk.label_pipeline = self.define_data_pipeline(threads=dk.thread_count) + dd = dk.data_dictionary + dd["predict_features"], outliers, _ = dk.feature_pipeline.transform( + dd["predict_features"], outlier_check=True) + if self.freqai_info.get("DI_threshold", 0) > 0: + dk.DI_values = dk.feature_pipeline["di"].di_values + else: + dk.DI_values = np.zeros(len(outliers.index)) + dk.do_predict = outliers.to_numpy() return From 1567cd28496e765b809568550f17793554d3e70b Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 17 Jun 2023 09:10:54 +0200 Subject: [PATCH 27/32] Use DOCS_LINK throughout --- freqtrade/freqai/data_kitchen.py | 12 +++++------- freqtrade/util/binance_mig.py | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 3a91c8551..7d4bf39ca 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -16,7 +16,7 @@ from pandas import DataFrame from sklearn.model_selection import train_test_split from freqtrade.configuration import TimeRange -from freqtrade.constants import Config +from freqtrade.constants import DOCS_LINK, Config from freqtrade.data.converter import reduce_dataframe_footprint from freqtrade.exceptions import OperationalException from freqtrade.exchange import timeframe_to_seconds @@ -760,9 +760,9 @@ class FreqaiDataKitchen: " which was deprecated on March 1, 2023. 
Please refer " "to the strategy migration guide to use the new " "feature_engineering_* methods: \n" - "https://www.freqtrade.io/en/stable/strategy_migration/#freqai-strategy \n" + f"{DOCS_LINK}/strategy_migration/#freqai-strategy \n" "And the feature_engineering_* documentation: \n" - "https://www.freqtrade.io/en/latest/freqai-feature-engineering/" + f"{DOCS_LINK}/freqai-feature-engineering/" ) tfs: List[str] = self.freqai_config["feature_parameters"].get("include_timeframes") @@ -956,11 +956,10 @@ class FreqaiDataKitchen: """ Deprecation warning, migration assistance """ - ft = "https://www.freqtrade.io/en/latest" logger.warning(f"Your custom IFreqaiModel relies on the deprecated" " data pipeline. Please update your model to use the new data pipeline." " This can be achieved by following the migration guide at " - f"{ft}/strategy_migration/#freqai-new-data-pipeline " + f"{DOCS_LINK}/strategy_migration/#freqai-new-data-pipeline " "We added a basic pipeline for you, but this will be removed " "in a future version.") @@ -970,11 +969,10 @@ class FreqaiDataKitchen: """ Deprecation warning, migration assistance """ - ft = "https://www.freqtrade.io/en/latest" logger.warning(f"Your custom IFreqaiModel relies on the deprecated" " data pipeline. Please update your model to use the new data pipeline." 
" This can be achieved by following the migration guide at " - f"{ft}/strategy_migration/#freqai-new-data-pipeline " + f"{DOCS_LINK}/strategy_migration/#freqai-new-data-pipeline " "We added a basic pipeline for you, but this will be removed " "in a future version.") diff --git a/freqtrade/util/binance_mig.py b/freqtrade/util/binance_mig.py index 37a2d2ef1..9b0f8521f 100644 --- a/freqtrade/util/binance_mig.py +++ b/freqtrade/util/binance_mig.py @@ -3,7 +3,7 @@ import logging from packaging import version from sqlalchemy import select -from freqtrade.constants import Config +from freqtrade.constants import DOCS_LINK, Config from freqtrade.enums.tradingmode import TradingMode from freqtrade.exceptions import OperationalException from freqtrade.persistence.pairlock import PairLock @@ -25,7 +25,7 @@ def migrate_binance_futures_names(config: Config): if version.parse("2.6.26") > version.parse(ccxt.__version__): raise OperationalException( "Please follow the update instructions in the docs " - "(https://www.freqtrade.io/en/latest/updating/) to install a compatible ccxt version.") + f"({DOCS_LINK}/updating/) to install a compatible ccxt version.") _migrate_binance_futures_db(config) migrate_binance_futures_data(config) From 11ff454b3bb371eeb2ea038b49bf8856e045e5ec Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 17 Jun 2023 13:21:31 +0200 Subject: [PATCH 28/32] fix: ensure that a user setting up their own pipeline wont have conflicts with DI_values --- freqtrade/freqai/RL/BaseReinforcementLearningModel.py | 6 +----- freqtrade/freqai/base_models/BasePyTorchRegressor.py | 2 +- freqtrade/freqai/base_models/BaseRegressionModel.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index b59c47ad2..81cacc055 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -250,17 +250,13 
@@ class BaseReinforcementLearningModel(IFreqaiModel): dk.data_dictionary["prediction_features"] = self.drop_ohlc_from_df(filtered_dataframe, dk) - dk.data_dictionary["prediction_features"], outliers, _ = dk.feature_pipeline.transform( + dk.data_dictionary["prediction_features"], _, _ = dk.feature_pipeline.transform( dk.data_dictionary["prediction_features"], outlier_check=True) pred_df = self.rl_model_predict( dk.data_dictionary["prediction_features"], dk, self.model) pred_df.fillna(0, inplace=True) - if self.freqai_info.get("DI_threshold", 0) > 0: - dk.DI_values = dk.feature_pipeline["di"].di_values - dk.do_predict = outliers.to_numpy() - return (pred_df, dk.do_predict) def rl_model_predict(self, dataframe: DataFrame, diff --git a/freqtrade/freqai/base_models/BasePyTorchRegressor.py b/freqtrade/freqai/base_models/BasePyTorchRegressor.py index ec4d6b80c..b77fec31a 100644 --- a/freqtrade/freqai/base_models/BasePyTorchRegressor.py +++ b/freqtrade/freqai/base_models/BasePyTorchRegressor.py @@ -52,7 +52,7 @@ class BasePyTorchRegressor(BasePyTorchModel): pred_df = DataFrame(y.detach().tolist(), columns=[dk.label_list[0]]) pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) - if self.freqai_info.get("DI_threshold", 0) > 0: + if dk.feature_pipeline["di"]: dk.DI_values = dk.feature_pipeline["di"].di_values else: dk.DI_values = np.zeros(len(outliers.index)) diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index f1e33bff8..3cce978b5 100644 --- a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -111,7 +111,7 @@ class BaseRegressionModel(IFreqaiModel): pred_df = DataFrame(predictions, columns=dk.label_list) pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df) - if self.freqai_info.get("DI_threshold", 0) > 0: + if dk.feature_pipeline["di"]: dk.DI_values = dk.feature_pipeline["di"].di_values else: dk.DI_values = 
np.zeros(len(outliers.index)) From b0ab400ff36b5c60105540ead2c4a6d4ea177b0a Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 17 Jun 2023 15:39:33 +0200 Subject: [PATCH 29/32] fix: ensure test_size=0 is still accommodated --- .../freqai/RL/BaseReinforcementLearningModel.py | 11 ++++++----- .../freqai/base_models/BaseClassifierModel.py | 11 ++++++----- .../freqai/base_models/BasePyTorchClassifier.py | 11 ++++++----- .../freqai/base_models/BasePyTorchRegressor.py | 13 ++++++++----- .../freqai/base_models/BaseRegressionModel.py | 16 ++++++++-------- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index 81cacc055..4f7b55967 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -126,11 +126,12 @@ class BaseReinforcementLearningModel(IFreqaiModel): dd["train_labels"], dd["train_weights"]) - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], - dd["test_labels"], - dd["test_weights"]) + if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f'Training model on {len(dk.data_dictionary["train_features"].columns)}' diff --git a/freqtrade/freqai/base_models/BaseClassifierModel.py b/freqtrade/freqai/base_models/BaseClassifierModel.py index e536efea3..0a6100df3 100644 --- a/freqtrade/freqai/base_models/BaseClassifierModel.py +++ b/freqtrade/freqai/base_models/BaseClassifierModel.py @@ -61,11 +61,12 @@ class BaseClassifierModel(IFreqaiModel): dd["train_labels"], dd["train_weights"]) - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], - 
dd["test_labels"], - dd["test_weights"]) + if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" diff --git a/freqtrade/freqai/base_models/BasePyTorchClassifier.py b/freqtrade/freqai/base_models/BasePyTorchClassifier.py index 57f31629a..8a4e15308 100644 --- a/freqtrade/freqai/base_models/BasePyTorchClassifier.py +++ b/freqtrade/freqai/base_models/BasePyTorchClassifier.py @@ -197,11 +197,12 @@ class BasePyTorchClassifier(BasePyTorchModel): dd["train_labels"], dd["train_weights"]) - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], - dd["test_labels"], - dd["test_weights"]) + if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" diff --git a/freqtrade/freqai/base_models/BasePyTorchRegressor.py b/freqtrade/freqai/base_models/BasePyTorchRegressor.py index b77fec31a..325743134 100644 --- a/freqtrade/freqai/base_models/BasePyTorchRegressor.py +++ b/freqtrade/freqai/base_models/BasePyTorchRegressor.py @@ -96,12 +96,15 @@ class BasePyTorchRegressor(BasePyTorchModel): dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], dd["train_labels"], dd["train_weights"]) + dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], - dd["test_labels"], - dd["test_weights"]) + if 
self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" diff --git a/freqtrade/freqai/base_models/BaseRegressionModel.py b/freqtrade/freqai/base_models/BaseRegressionModel.py index 3cce978b5..2e07d3fb7 100644 --- a/freqtrade/freqai/base_models/BaseRegressionModel.py +++ b/freqtrade/freqai/base_models/BaseRegressionModel.py @@ -60,15 +60,15 @@ class BaseRegressionModel(IFreqaiModel): dd["train_weights"]) = dk.feature_pipeline.fit_transform(dd["train_features"], dd["train_labels"], dd["train_weights"]) - - (dd["test_features"], - dd["test_labels"], - dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], - dd["test_labels"], - dd["test_weights"]) - dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"]) - dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) + + if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + (dd["test_features"], + dd["test_labels"], + dd["test_weights"]) = dk.feature_pipeline.transform(dd["test_features"], + dd["test_labels"], + dd["test_weights"]) + dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"]) logger.info( f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" From 886b86f7c567e46e589717bb6691dbed6f22e6bd Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 17 Jun 2023 16:14:48 +0200 Subject: [PATCH 30/32] chore: bump datasieve --- requirements-freqai.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-freqai.txt b/requirements-freqai.txt index b9a50d92f..0d88976d0 100644 --- a/requirements-freqai.txt +++ 
b/requirements-freqai.txt @@ -9,4 +9,4 @@ catboost==1.2; 'arm' not in platform_machine lightgbm==3.3.5 xgboost==1.7.5 tensorboard==2.13.0 -datasieve==0.1.4 +datasieve==0.1.5 diff --git a/setup.py b/setup.py index 996a8b8f9..4b73ae653 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ freqai = [ 'lightgbm', 'xgboost', 'tensorboard', - 'datasieve>=0.1.4' + 'datasieve>=0.1.5' ] freqai_rl = [ From 7e2f857aa5fcc2273d4d114ff9d0e65ddbcac741 Mon Sep 17 00:00:00 2001 From: Robert Caulk Date: Sun, 18 Jun 2023 11:30:33 +0200 Subject: [PATCH 31/32] Update BasePyTorchClassifier.py --- freqtrade/freqai/base_models/BasePyTorchClassifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/freqtrade/freqai/base_models/BasePyTorchClassifier.py b/freqtrade/freqai/base_models/BasePyTorchClassifier.py index 8a4e15308..c47c5069a 100644 --- a/freqtrade/freqai/base_models/BasePyTorchClassifier.py +++ b/freqtrade/freqai/base_models/BasePyTorchClassifier.py @@ -91,7 +91,7 @@ class BasePyTorchClassifier(BasePyTorchModel): pred_df = DataFrame(predicted_classes_str, columns=[dk.label_list[0]]) pred_df = pd.concat([pred_df, pred_df_prob], axis=1) - if self.freqai_info.get("DI_threshold", 0) > 0: + if dk.feature_pipeline["di"]: dk.DI_values = dk.feature_pipeline["di"].di_values else: dk.DI_values = np.zeros(len(outliers.index)) From cca4fa1178e8186f21e1b90645a82e7fce8ac9c4 Mon Sep 17 00:00:00 2001 From: Robert Caulk Date: Sun, 18 Jun 2023 11:31:03 +0200 Subject: [PATCH 32/32] Update BaseClassifierModel.py --- freqtrade/freqai/base_models/BaseClassifierModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/freqtrade/freqai/base_models/BaseClassifierModel.py b/freqtrade/freqai/base_models/BaseClassifierModel.py index 0a6100df3..f35b07e66 100644 --- a/freqtrade/freqai/base_models/BaseClassifierModel.py +++ b/freqtrade/freqai/base_models/BaseClassifierModel.py @@ -117,7 +117,7 @@ class BaseClassifierModel(IFreqaiModel): pred_df = pd.concat([pred_df, 
pred_df_prob], axis=1) - if self.freqai_info.get("DI_threshold", 0) > 0: + if dk.feature_pipeline["di"]: dk.DI_values = dk.feature_pipeline["di"].di_values else: dk.DI_values = np.zeros(len(outliers.index))