start transition toward outsourcing the data pipeline, with the objective of improving pipeline flexibility

robcaulk
2023-05-26 18:40:14 +02:00
parent c23a045de4
commit 31e19add27
8 changed files with 579 additions and 586 deletions

View File

@@ -7,14 +7,15 @@ import torch
from pandas import DataFrame
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-from freqtrade.freqai.freqai_interface import IFreqaiModel
+# from freqtrade.freqai.freqai_interface import IFreqaiModel
+from freqtrade.freqai.base_models import BaseRegressionModel
from freqtrade.freqai.torch.PyTorchDataConvertor import PyTorchDataConvertor
logger = logging.getLogger(__name__)
-class BasePyTorchModel(IFreqaiModel, ABC):
+class BasePyTorchModel(BaseRegressionModel):
"""
Base class for PyTorch type models.
User *must* inherit from this class and set fit() and predict() and
@@ -29,50 +30,50 @@ class BasePyTorchModel(IFreqaiModel, ABC):
self.splits = ["train", "test"] if test_size != 0 else ["train"]
self.window_size = self.freqai_info.get("conv_width", 1)
+# def train(
+# self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
+# ) -> Any:
+# """
+# Filter the training data and train a model to it. Train makes heavy use of the datakitchen
+# for storing, saving, loading, and analyzing the data.
+# :param unfiltered_df: Full dataframe for the current training period
+# :return:
+# :model: Trained model which can be used to inference (self.predict)
+# """
+# logger.info(f"-------------------- Starting training {pair} --------------------")
+# start_time = time()
+# features_filtered, labels_filtered = dk.filter_features(
+# unfiltered_df,
+# dk.training_features_list,
+# dk.label_list,
+# training_filter=True,
+# )
+# # split data into train/test data.
+# data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
+# if not self.freqai_info.get("fit_live_predictions", 0) or not self.live:
+# dk.fit_labels()
+# # normalize all data based on train_dataset only
+# data_dictionary = dk.normalize_data(data_dictionary)
+# # optional additional data cleaning/analysis
+# self.data_cleaning_train(dk)
+# logger.info(
+# f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
+# )
+# logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
+# model = self.fit(data_dictionary, dk)
+# end_time = time()
+# logger.info(f"-------------------- Done training {pair} "
+# f"({end_time - start_time:.2f} secs) --------------------")
+# return model
@property
@abstractmethod
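
With BasePyTorchModel now deriving from BaseRegressionModel, the shared train() logic (and the new pipeline handling) is inherited; concrete models only supply the framework-specific hooks. A minimal sketch of what a subclass might look like, not part of this commit; the class name is hypothetical, the import paths are assumed from the file layout, and the abstract data convertor property hinted at by the context above is merely stubbed:

# Illustrative sketch only -- not part of this commit.
from typing import Any, Dict

from freqtrade.freqai.base_models.BasePyTorchModel import BasePyTorchModel
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.torch.PyTorchDataConvertor import PyTorchDataConvertor


class MyPyTorchRegressor(BasePyTorchModel):
    """train() now comes from BaseRegressionModel; only fit() (and optionally
    predict()) plus the data convertor need to be provided."""

    @property
    def data_convertor(self) -> PyTorchDataConvertor:
        raise NotImplementedError("return a PyTorchDataConvertor suited to the model")

    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any:
        # build a torch.nn.Module and train it on data_dictionary["train_features"]
        # and data_dictionary["train_labels"], then return the trained object
        raise NotImplementedError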

View File

@@ -49,21 +49,34 @@ class BaseRegressionModel(IFreqaiModel):
logger.info(f"-------------------- Training on data from {start_date} to " logger.info(f"-------------------- Training on data from {start_date} to "
f"{end_date} --------------------") f"{end_date} --------------------")
# split data into train/test data. # split data into train/test data.
data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered) d = dk.make_train_test_datasets(features_filtered, labels_filtered)
if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live: if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
dk.fit_labels() dk.fit_labels()
# normalize all data based on train_dataset only
data_dictionary = dk.normalize_data(data_dictionary)
# optional additional data cleaning/analysis self.define_data_pipeline(dk)
self.data_cleaning_train(dk) self.define_label_pipeline(dk)
d["train_labels"], _, _ = dk.label_pipeline.fit_transform(d["train_labels"])
d["test_labels"], _, _ = dk.label_pipeline.transform(d["test_labels"])
(d["train_features"],
d["train_labels"],
d["train_weights"]) = dk.pipeline.fit_transform(d["train_features"],
d["train_labels"],
d["train_weights"])
(d["test_features"],
d["test_labels"],
d["test_weights"]) = dk.pipeline.transform(d["test_features"],
d["test_labels"],
d["test_weights"])
logger.info( logger.info(
f"Training model on {len(dk.data_dictionary['train_features'].columns)} features" f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
) )
logger.info(f"Training model on {len(data_dictionary['train_features'])} data points") logger.info(f"Training model on {len(d['train_features'])} data points")
model = self.fit(data_dictionary, dk) model = self.fit(d, dk)
end_time = time() end_time = time()
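
Taken out of the Freqtrade classes, the new train-time flow amounts to: fit both pipelines on the training split, then run the already-fitted pipelines over the test split. A toy sketch, not part of this commit, using only the datasieve calls exercised in the hunk above (data, column names, and the chosen steps are made up):

# Illustrative sketch only -- not part of this commit.
import numpy as np
import pandas as pd
import datasieve.transforms as ds
from datasieve.pipeline import Pipeline

d = {
    "train_features": pd.DataFrame(np.random.rand(200, 4), columns=[f"f{i}" for i in range(4)]),
    "train_labels": pd.DataFrame(np.random.rand(200, 1), columns=["&-target"]),
    "train_weights": np.ones(200),
    "test_features": pd.DataFrame(np.random.rand(50, 4), columns=[f"f{i}" for i in range(4)]),
    "test_labels": pd.DataFrame(np.random.rand(50, 1), columns=["&-target"]),
    "test_weights": np.ones(50),
}

pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1))),
                     ("svm", ds.SVMOutlierExtractor())])
label_pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

# labels go through their own pipeline, fitted on the train split only
d["train_labels"], _, _ = label_pipeline.fit_transform(d["train_labels"])
d["test_labels"], _, _ = label_pipeline.transform(d["test_labels"])

# features/labels/weights go through the feature pipeline together, so any
# outlier removal keeps all three aligned
d["train_features"], d["train_labels"], d["train_weights"] = pipeline.fit_transform(
    d["train_features"], d["train_labels"], d["train_weights"])
d["test_features"], d["test_labels"], d["test_weights"] = pipeline.transform(
    d["test_features"], d["test_labels"], d["test_weights"])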
@@ -88,11 +101,11 @@ class BaseRegressionModel(IFreqaiModel):
filtered_df, _ = dk.filter_features(
unfiltered_df, dk.training_features_list, training_filter=False
)
-filtered_df = dk.normalize_data_from_metadata(filtered_df)
+# filtered_df = dk.normalize_data_from_metadata(filtered_df)
dk.data_dictionary["prediction_features"] = filtered_df
-# optional additional data cleaning/analysis
-self.data_cleaning_predict(dk)
+dk.data_dictionary["prediction_features"], outliers, _ = dk.pipeline.transform(
+dk.data_dictionary["prediction_features"], outlier_check=True)
predictions = self.model.predict(dk.data_dictionary["prediction_features"])
if self.CONV_WIDTH == 1:
@@ -100,6 +113,8 @@ class BaseRegressionModel(IFreqaiModel):
pred_df = DataFrame(predictions, columns=dk.label_list)
-pred_df = dk.denormalize_labels_from_metadata(pred_df)
+pred_df, _, _ = dk.label_pipeline.inverse_transform(pred_df)
+dk.DI_values = dk.label_pipeline.get_step("di").di_values
+dk.do_predict = outliers.to_numpy()
return (pred_df, dk.do_predict)
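
The predict-time counterpart, continuing the toy sketch above: transform incoming features with the already-fitted feature pipeline (asking it to flag outliers), run the model, and map predictions back through the label pipeline. The zero-filled frame below stands in for an actual model prediction; this is illustrative, not part of this commit:

# Illustrative sketch only -- continues the toy example above, not part of this commit.
new_features = pd.DataFrame(np.random.rand(10, 4), columns=[f"f{i}" for i in range(4)])

pred_features, outliers, _ = pipeline.transform(new_features, outlier_check=True)
raw_predictions = pd.DataFrame(np.zeros((len(pred_features), 1)), columns=["&-target"])
pred_df, _, _ = label_pipeline.inverse_transform(raw_predictions)
do_predict = outliers.to_numpy()  # 1 = point looks like training data, 0 = flagged outlier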

View File

@@ -1,70 +0,0 @@
import logging
from time import time
from typing import Any
from pandas import DataFrame
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.freqai_interface import IFreqaiModel
logger = logging.getLogger(__name__)
class BaseTensorFlowModel(IFreqaiModel):
"""
Base class for TensorFlow type models.
User *must* inherit from this class and set fit() and predict().
"""
def train(
self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
) -> Any:
"""
Filter the training data and train a model to it. Train makes heavy use of the datakitchen
for storing, saving, loading, and analyzing the data.
:param unfiltered_df: Full dataframe for the current training period
:param metadata: pair metadata from strategy.
:return:
:model: Trained model which can be used to inference (self.predict)
"""
logger.info(f"-------------------- Starting training {pair} --------------------")
start_time = time()
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = dk.filter_features(
unfiltered_df,
dk.training_features_list,
dk.label_list,
training_filter=True,
)
start_date = unfiltered_df["date"].iloc[0].strftime("%Y-%m-%d")
end_date = unfiltered_df["date"].iloc[-1].strftime("%Y-%m-%d")
logger.info(f"-------------------- Training on data from {start_date} to "
f"{end_date} --------------------")
# split data into train/test data.
data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
dk.fit_labels()
# normalize all data based on train_dataset only
data_dictionary = dk.normalize_data(data_dictionary)
# optional additional data cleaning/analysis
self.data_cleaning_train(dk)
logger.info(
f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
)
logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
model = self.fit(data_dictionary, dk)
end_time = time()
logger.info(f"-------------------- Done training {pair} "
f"({end_time - start_time:.2f} secs) --------------------")
return model

View File

@@ -460,6 +460,13 @@ class FreqaiDataDrawer:
with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp: with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
# save the pipelines to pickle files
with (save_path / f"{dk.model_filename}_pipeline.pkl").open("wb") as fp:
cloudpickle.dump(dk.pipeline, fp)
with (save_path / f"{dk.model_filename}_label_pipeline.pkl").open("wb") as fp:
cloudpickle.dump(dk.label_pipeline, fp)
# save the train data to file so we can check preds for area of applicability later # save the train data to file so we can check preds for area of applicability later
dk.data_dictionary["train_features"].to_pickle( dk.data_dictionary["train_features"].to_pickle(
save_path / f"{dk.model_filename}_trained_df.pkl" save_path / f"{dk.model_filename}_trained_df.pkl"
@@ -482,6 +489,8 @@ class FreqaiDataDrawer:
self.meta_data_dictionary[coin] = {}
self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
self.meta_data_dictionary[coin]["meta_data"] = dk.data
+self.meta_data_dictionary[coin]["pipeline"] = dk.pipeline
+self.meta_data_dictionary[coin]["label_pipeline"] = dk.label_pipeline
self.save_drawer_to_disk()
return
@@ -513,6 +522,8 @@ class FreqaiDataDrawer:
if coin in self.meta_data_dictionary:
dk.data = self.meta_data_dictionary[coin]["meta_data"]
dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"]
+dk.pipeline = self.meta_data_dictionary[coin]["pipeline"]
+dk.label_pipeline = self.meta_data_dictionary[coin]["label_pipeline"]
else:
with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp:
dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
@@ -520,6 +531,10 @@ class FreqaiDataDrawer:
dk.data_dictionary["train_features"] = pd.read_pickle( dk.data_dictionary["train_features"] = pd.read_pickle(
dk.data_path / f"{dk.model_filename}_trained_df.pkl" dk.data_path / f"{dk.model_filename}_trained_df.pkl"
) )
with (dk.data_path / f"{dk.model_filename}_pipeline.pkl").open("rb") as fp:
dk.pipeline = cloudpickle.load(fp)
with (dk.data_path / f"{dk.model_filename}_label_pipeline.pkl").open("rb") as fp:
dk.label_pipeline = cloudpickle.load(fp)
dk.training_features_list = dk.data["training_features_list"] dk.training_features_list = dk.data["training_features_list"]
dk.label_list = dk.data["label_list"] dk.label_list = dk.data["label_list"]
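
Because the pipelines are ordinary Python objects, persisting them with cloudpickle is a plain round trip. A standalone sketch, not part of this commit (the folder and file name below are made up):

# Illustrative sketch only -- not part of this commit.
from pathlib import Path

import cloudpickle
import datasieve.transforms as ds
from datasieve.pipeline import Pipeline

save_path = Path("/tmp/freqai_example")  # hypothetical model folder
save_path.mkdir(parents=True, exist_ok=True)
pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

with (save_path / "example_pipeline.pkl").open("wb") as fp:
    cloudpickle.dump(pipeline, fp)

with (save_path / "example_pipeline.pkl").open("rb") as fp:
    restored_pipeline = cloudpickle.load(fp)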

View File

@@ -27,6 +27,7 @@ from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.strategy import merge_informative_pair
from freqtrade.strategy.interface import IStrategy
+from datasieve.pipeline import Pipeline
SECONDS_IN_DAY = 86400
@@ -86,6 +87,8 @@ class FreqaiDataKitchen:
self.keras: bool = self.freqai_config.get("keras", False)
self.set_all_pairs()
self.backtest_live_models = config.get("freqai_backtest_live_models", False)
+self.pipeline = Pipeline()
+self.label_pipeline = Pipeline()
if not self.live:
self.full_path = self.get_full_models_path(self.config)
@@ -307,106 +310,106 @@ class FreqaiDataKitchen:
return self.data_dictionary
+# def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+# """
+# Normalize all data in the data_dictionary according to the training dataset
+# :param data_dictionary: dictionary containing the cleaned and
+# split training/test data/labels
+# :returns:
+# :data_dictionary: updated dictionary with standardized values.
+# """
+# # standardize the data by training stats
+# train_max = data_dictionary["train_features"].max()
+# train_min = data_dictionary["train_features"].min()
+# data_dictionary["train_features"] = (
+# 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1
+# )
+# data_dictionary["test_features"] = (
+# 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1
+# )
+# for item in train_max.keys():
+# self.data[item + "_max"] = train_max[item]
+# self.data[item + "_min"] = train_min[item]
+# for item in data_dictionary["train_labels"].keys():
+# if data_dictionary["train_labels"][item].dtype == object:
+# continue
+# train_labels_max = data_dictionary["train_labels"][item].max()
+# train_labels_min = data_dictionary["train_labels"][item].min()
+# data_dictionary["train_labels"][item] = (
+# 2
+# * (data_dictionary["train_labels"][item] - train_labels_min)
+# / (train_labels_max - train_labels_min)
+# - 1
+# )
+# if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+# data_dictionary["test_labels"][item] = (
+# 2
+# * (data_dictionary["test_labels"][item] - train_labels_min)
+# / (train_labels_max - train_labels_min)
+# - 1
+# )
+# self.data[f"{item}_max"] = train_labels_max
+# self.data[f"{item}_min"] = train_labels_min
+# return data_dictionary
+# def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
+# train_max = df.max()
+# train_min = df.min()
+# df = (
+# 2 * (df - train_min) / (train_max - train_min) - 1
+# )
+# for item in train_max.keys():
+# self.data[item + "_max"] = train_max[item]
+# self.data[item + "_min"] = train_min[item]
+# return df
+# def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
+# """
+# Normalize a set of data using the mean and standard deviation from
+# the associated training data.
+# :param df: Dataframe to be standardized
+# """
+# train_max = [None] * len(df.keys())
+# train_min = [None] * len(df.keys())
+# for i, item in enumerate(df.keys()):
+# train_max[i] = self.data[f"{item}_max"]
+# train_min[i] = self.data[f"{item}_min"]
+# train_max_series = pd.Series(train_max, index=df.keys())
+# train_min_series = pd.Series(train_min, index=df.keys())
+# df = (
+# 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1
+# )
+# return df
+# def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
+# """
+# Denormalize a set of data using the mean and standard deviation from
+# the associated training data.
+# :param df: Dataframe of predictions to be denormalized
+# """
+# for label in df.columns:
+# if df[label].dtype == object or label in self.unique_class_list:
+# continue
+# df[label] = (
+# (df[label] + 1)
+# * (self.data[f"{label}_max"] - self.data[f"{label}_min"])
+# / 2
+# ) + self.data[f"{label}_min"]
+# return df
def split_timerange(
self, tr: str, train_split: int = 28, bt_split: float = 7
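
For reference, the scaling these removed helpers implemented is the plain (-1, 1) min-max mapping, which the DataSieveMinMaxScaler(feature_range=(-1, 1)) step in the new pipelines is intended to cover. A standalone restatement, not part of this commit:

# Illustrative sketch only -- not part of this commit.
import numpy as np
import pandas as pd


def minmax_scale(df: pd.DataFrame, train_min: pd.Series, train_max: pd.Series) -> pd.DataFrame:
    # same per-column mapping as the removed normalize_* helpers
    return 2 * (df - train_min) / (train_max - train_min) - 1


def minmax_unscale(df: pd.DataFrame, train_min: pd.Series, train_max: pd.Series) -> pd.DataFrame:
    # inverse mapping used by the removed denormalize_labels_from_metadata
    return (df + 1) * (train_max - train_min) / 2 + train_min


train = pd.DataFrame(np.random.rand(50, 2), columns=["a", "b"])
lo, hi = train.min(), train.max()
assert np.allclose(train, minmax_unscale(minmax_scale(train, lo, hi), lo, hi))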
@@ -501,398 +504,398 @@ class FreqaiDataKitchen:
return df_predictions
+# def principal_component_analysis(self) -> None:
+# """
+# Performs Principal Component Analysis on the data for dimensionality reduction
+# and outlier detection (see self.remove_outliers())
+# No parameters or returns, it acts on the data_dictionary held by the DataHandler.
+# """
+# from sklearn.decomposition import PCA # avoid importing if we dont need it
+# pca = PCA(0.999)
+# pca = pca.fit(self.data_dictionary["train_features"])
+# n_keep_components = pca.n_components_
+# self.data["n_kept_components"] = n_keep_components
+# n_components = self.data_dictionary["train_features"].shape[1]
+# logger.info("reduced feature dimension by %s", n_components - n_keep_components)
+# logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_))
+# train_components = pca.transform(self.data_dictionary["train_features"])
+# self.data_dictionary["train_features"] = pd.DataFrame(
+# data=train_components,
+# columns=["PC" + str(i) for i in range(0, n_keep_components)],
+# index=self.data_dictionary["train_features"].index,
+# )
+# # normalsing transformed training features
+# self.data_dictionary["train_features"] = self.normalize_single_dataframe(
+# self.data_dictionary["train_features"])
+# # keeping a copy of the non-transformed features so we can check for errors during
+# # model load from disk
+# self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list)
+# self.training_features_list = self.data_dictionary["train_features"].columns
+# if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+# test_components = pca.transform(self.data_dictionary["test_features"])
+# self.data_dictionary["test_features"] = pd.DataFrame(
+# data=test_components,
+# columns=["PC" + str(i) for i in range(0, n_keep_components)],
+# index=self.data_dictionary["test_features"].index,
+# )
+# # normalise transformed test feature to transformed training features
+# self.data_dictionary["test_features"] = self.normalize_data_from_metadata(
+# self.data_dictionary["test_features"])
+# self.data["n_kept_components"] = n_keep_components
+# self.pca = pca
+# logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}")
+# if not self.data_path.is_dir():
+# self.data_path.mkdir(parents=True, exist_ok=True)
+# return None
+# def pca_transform(self, filtered_dataframe: DataFrame) -> None:
+# """
+# Use an existing pca transform to transform data into components
+# :param filtered_dataframe: DataFrame = the cleaned dataframe
+# """
+# pca_components = self.pca.transform(filtered_dataframe)
+# self.data_dictionary["prediction_features"] = pd.DataFrame(
+# data=pca_components,
+# columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])],
+# index=filtered_dataframe.index,
+# )
+# # normalise transformed predictions to transformed training features
+# self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata(
+# self.data_dictionary["prediction_features"])
+# def compute_distances(self) -> float:
+# """
+# Compute distances between each training point and every other training
+# point. This metric defines the neighborhood of trained data and is used
+# for prediction confidence in the Dissimilarity Index
+# """
+# # logger.info("computing average mean distance for all training points")
+# pairwise = pairwise_distances(
+# self.data_dictionary["train_features"], n_jobs=self.thread_count)
+# # remove the diagonal distances which are itself distances ~0
+# np.fill_diagonal(pairwise, np.NaN)
+# pairwise = pairwise.reshape(-1, 1)
+# avg_mean_dist = pairwise[~np.isnan(pairwise)].mean()
+# return avg_mean_dist
+# def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float:
+# """
+# Check if more than X% of points werer dropped during outlier detection.
+# """
+# outlier_protection_pct = self.freqai_config["feature_parameters"].get(
+# "outlier_protection_percentage", 30)
+# outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100
+# if outlier_pct >= outlier_protection_pct:
+# return outlier_pct
+# else:
+# return 0.0
+# def use_SVM_to_remove_outliers(self, predict: bool) -> None:
+# """
+# Build/inference a Support Vector Machine to detect outliers
+# in training data and prediction
+# :param predict: bool = If true, inference an existing SVM model, else construct one
+# """
+# if self.keras:
+# logger.warning(
+# "SVM outlier removal not currently supported for Keras based models. "
+# "Skipping user requested function."
+# )
+# if predict:
+# self.do_predict = np.ones(len(self.data_dictionary["prediction_features"]))
+# return
+# if predict:
+# if not self.svm_model:
+# logger.warning("No svm model available for outlier removal")
+# return
+# y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"])
+# do_predict = np.where(y_pred == -1, 0, y_pred)
+# if (len(do_predict) - do_predict.sum()) > 0:
+# logger.info(f"SVM tossed {len(do_predict) - do_predict.sum()} predictions.")
+# self.do_predict += do_predict
+# self.do_predict -= 1
+# else:
+# # use SGDOneClassSVM to increase speed?
+# svm_params = self.freqai_config["feature_parameters"].get(
+# "svm_params", {"shuffle": False, "nu": 0.1})
+# self.svm_model = linear_model.SGDOneClassSVM(**svm_params).fit(
+# self.data_dictionary["train_features"]
+# )
+# y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
+# kept_points = np.where(y_pred == -1, 0, y_pred)
+# # keep_index = np.where(y_pred == 1)
+# outlier_pct = self.get_outlier_percentage(1 - kept_points)
+# if outlier_pct:
+# logger.warning(
+# f"SVM detected {outlier_pct:.2f}% of the points as outliers. "
+# f"Keeping original dataset."
+# )
+# self.svm_model = None
+# return
+# self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
+# (y_pred == 1)
+# ]
+# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
+# (y_pred == 1)
+# ]
+# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
+# (y_pred == 1)
+# ]
+# logger.info(
+# f"SVM tossed {len(y_pred) - kept_points.sum()}"
+# f" train points from {len(y_pred)} total points."
+# )
+# # same for test data
+# # TODO: This (and the part above) could be refactored into a separate function
+# # to reduce code duplication
+# if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
+# y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
+# kept_points = np.where(y_pred == -1, 0, y_pred)
+# self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
+# (y_pred == 1)
+# ]
+# self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][(
+# y_pred == 1)]
+# self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
+# (y_pred == 1)
+# ]
+# logger.info(
+# f"{self.pair}: SVM tossed {len(y_pred) - kept_points.sum()}"
+# f" test points from {len(y_pred)} total points."
+# )
+# return
+# def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
+# """
+# Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
+# User controls this via the config param `DBSCAN_outlier_pct` which indicates the
+# pct of training data that they want to be considered outliers.
+# :param predict: bool = If False (training), iterate to find the best hyper parameters
+# to match user requested outlier percent target.
+# If True (prediction), use the parameters determined from
+# the previous training to estimate if the current prediction point
+# is an outlier.
+# """
+# if predict:
+# if not self.data['DBSCAN_eps']:
+# return
+# train_ft_df = self.data_dictionary['train_features']
+# pred_ft_df = self.data_dictionary['prediction_features']
+# num_preds = len(pred_ft_df)
+# df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
+# clustering = DBSCAN(eps=self.data['DBSCAN_eps'],
+# min_samples=self.data['DBSCAN_min_samples'],
+# n_jobs=self.thread_count
+# ).fit(df)
+# do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1)
+# if (len(do_predict) - do_predict.sum()) > 0:
+# logger.info(f"DBSCAN tossed {len(do_predict) - do_predict.sum()} predictions")
+# self.do_predict += do_predict
+# self.do_predict -= 1
+# else:
+# def normalise_distances(distances):
+# normalised_distances = (distances - distances.min()) / \
+# (distances.max() - distances.min())
+# return normalised_distances
+# def rotate_point(origin, point, angle):
+# # rotate a point counterclockwise by a given angle (in radians)
+# # around a given origin
+# x = origin[0] + cos(angle) * (point[0] - origin[0]) - \
+# sin(angle) * (point[1] - origin[1])
+# y = origin[1] + sin(angle) * (point[0] - origin[0]) + \
+# cos(angle) * (point[1] - origin[1])
+# return (x, y)
+# MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25)
+# # measure pairwise distances to nearest neighbours
+# neighbors = NearestNeighbors(
+# n_neighbors=MinPts, n_jobs=self.thread_count)
+# neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
+# distances, _ = neighbors_fit.kneighbors(self.data_dictionary['train_features'])
+# distances = np.sort(distances, axis=0).mean(axis=1)
+# normalised_distances = normalise_distances(distances)
+# x_range = np.linspace(0, 1, len(distances))
+# line = np.linspace(normalised_distances[0],
+# normalised_distances[-1], len(normalised_distances))
+# deflection = np.abs(normalised_distances - line)
+# max_deflection_loc = np.where(deflection == deflection.max())[0][0]
+# origin = x_range[max_deflection_loc], line[max_deflection_loc]
+# point = x_range[max_deflection_loc], normalised_distances[max_deflection_loc]
+# rot_angle = np.pi / 4
+# elbow_loc = rotate_point(origin, point, rot_angle)
+# epsilon = elbow_loc[1] * (distances[-1] - distances[0]) + distances[0]
+# clustering = DBSCAN(eps=epsilon, min_samples=MinPts,
+# n_jobs=int(self.thread_count)).fit(
+# self.data_dictionary['train_features']
+# )
+# logger.info(f'DBSCAN found eps of {epsilon:.2f}.')
+# self.data['DBSCAN_eps'] = epsilon
+# self.data['DBSCAN_min_samples'] = MinPts
+# dropped_points = np.where(clustering.labels_ == -1, 1, 0)
+# outlier_pct = self.get_outlier_percentage(dropped_points)
+# if outlier_pct:
+# logger.warning(
+# f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. "
+# f"Keeping original dataset."
+# )
+# self.data['DBSCAN_eps'] = 0
+# return
+# self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
+# (clustering.labels_ != -1)
+# ]
+# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
+# (clustering.labels_ != -1)
+# ]
+# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
+# (clustering.labels_ != -1)
+# ]
+# logger.info(
+# f"DBSCAN tossed {dropped_points.sum()}"
+# f" train points from {len(clustering.labels_)}"
+# )
+# return
+# def compute_inlier_metric(self, set_='train') -> None:
+# """
+# Compute inlier metric from backwards distance distributions.
+# This metric defines how well features from a timepoint fit
+# into previous timepoints.
+# """
+# def normalise(dataframe: DataFrame, key: str) -> DataFrame:
+# if set_ == 'train':
+# min_value = dataframe.min()
+# max_value = dataframe.max()
+# self.data[f'{key}_min'] = min_value
+# self.data[f'{key}_max'] = max_value
+# else:
+# min_value = self.data[f'{key}_min']
+# max_value = self.data[f'{key}_max']
+# return (dataframe - min_value) / (max_value - min_value)
+# no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
+# if set_ == 'train':
+# compute_df = copy.deepcopy(self.data_dictionary['train_features'])
+# elif set_ == 'test':
+# compute_df = copy.deepcopy(self.data_dictionary['test_features'])
+# else:
+# compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
+# compute_df_reindexed = compute_df.reindex(
+# index=np.flip(compute_df.index)
+# )
+# pairwise = pd.DataFrame(
+# np.triu(
+# pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
+# ),
+# columns=compute_df_reindexed.index,
+# index=compute_df_reindexed.index
+# )
+# pairwise = pairwise.round(5)
+# column_labels = [
+# '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
+# ]
+# distances = pd.DataFrame(
+# columns=column_labels, index=compute_df.index
+# )
+# for index in compute_df.index[no_prev_pts:]:
+# current_row = pairwise.loc[[index]]
+# current_row_no_zeros = current_row.loc[
+# :, (current_row != 0).any(axis=0)
+# ]
+# distances.loc[[index]] = current_row_no_zeros.iloc[
+# :, :no_prev_pts
+# ]
+# distances = distances.replace([np.inf, -np.inf], np.nan)
+# drop_index = pd.isnull(distances).any(axis=1)
+# distances = distances[drop_index == 0]
+# inliers = pd.DataFrame(index=distances.index)
+# for key in distances.keys():
+# current_distances = distances[key].dropna()
+# current_distances = normalise(current_distances, key)
+# if set_ == 'train':
+# fit_params = stats.weibull_min.fit(current_distances)
+# self.data[f'{key}_fit_params'] = fit_params
+# else:
+# fit_params = self.data[f'{key}_fit_params']
+# quantiles = stats.weibull_min.cdf(current_distances, *fit_params)
+# df_inlier = pd.DataFrame(
+# {key: quantiles}, index=distances.index
+# )
+# inliers = pd.concat(
+# [inliers, df_inlier], axis=1
+# )
+# inlier_metric = pd.DataFrame(
+# data=inliers.sum(axis=1) / no_prev_pts,
+# columns=['%-inlier_metric'],
+# index=compute_df.index
+# )
+# inlier_metric = (2 * (inlier_metric - inlier_metric.min()) /
+# (inlier_metric.max() - inlier_metric.min()) - 1)
+# if set_ in ('train', 'test'):
+# inlier_metric = inlier_metric.iloc[no_prev_pts:]
+# compute_df = compute_df.iloc[no_prev_pts:]
+# self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
+# self.data_dictionary[f'{set_}_features'] = pd.concat(
+# [compute_df, inlier_metric], axis=1)
+# else:
+# self.data_dictionary['prediction_features'] = pd.concat(
+# [compute_df, inlier_metric], axis=1)
+# self.data_dictionary['prediction_features'].fillna(0, inplace=True)
+# logger.info('Inlier metric computed and added to features.')
+# return None
+# def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
+# features = self.data_dictionary[f'{set_}_features']
+# weights = self.data_dictionary[f'{set_}_weights']
+# labels = self.data_dictionary[f'{set_}_labels']
+# self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
+# self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
+# self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
def add_noise_to_training_features(self) -> None:
"""

View File

@@ -23,6 +23,8 @@ from freqtrade.freqai.data_drawer import FreqaiDataDrawer
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.utils import get_tb_logger, plot_feature_importance, record_params
from freqtrade.strategy.interface import IStrategy
+from datasieve.pipeline import Pipeline
+import datasieve.transforms as ds
pd.options.mode.chained_assignment = None
@@ -566,6 +568,32 @@ class IFreqaiModel(ABC):
if ft_params.get("use_DBSCAN_to_remove_outliers", False): if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True) dk.use_DBSCAN_to_remove_outliers(predict=True)
def define_data_pipeline(self, dk: FreqaiDataKitchen) -> None:
ft_params = self.freqai_info["feature_parameters"]
dk.pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])
if ft_params.get("principal_component_analysis", False):
dk.pipeline.steps += [('pca', ds.DataSievePCA())]
dk.pipeline.steps += [('post-pca-scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.pipeline.steps += [('svm', ds.SVMOutlierExtractor())]
if ft_params.get("DI_threshold", 0):
dk.pipeline.steps += [('di', ds.DissimilarityIndex())]
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())]
dk.pipeline.fitparams = dk.pipeline._validate_fitparams({}, dk.pipeline.steps)
# if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
# dk.pipeline.extend(('noise', ds.Noise()))
def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None:
dk.label_pipeline = Pipeline([('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])
def model_exists(self, dk: FreqaiDataKitchen) -> bool: def model_exists(self, dk: FreqaiDataKitchen) -> bool:
""" """
Given a pair and path, check if a model already exists Given a pair and path, check if a model already exists
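
Restated outside the class, the mapping from feature_parameters flags to pipeline steps in define_data_pipeline looks like this; the ft_params dict below is made up for illustration and this is not part of the commit:

# Illustrative sketch only -- not part of this commit.
import datasieve.transforms as ds
from datasieve.pipeline import Pipeline

ft_params = {  # hypothetical feature_parameters block
    "principal_component_analysis": True,
    "use_SVM_to_remove_outliers": True,
    "DI_threshold": 0.9,
    "use_DBSCAN_to_remove_outliers": False,
}

steps = [("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
if ft_params.get("principal_component_analysis", False):
    steps += [("pca", ds.DataSievePCA()),
              ("post-pca-scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
if ft_params.get("use_SVM_to_remove_outliers", False):
    steps += [("svm", ds.SVMOutlierExtractor())]
if ft_params.get("DI_threshold", 0):
    steps += [("di", ds.DissimilarityIndex())]
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
    steps += [("dbscan", ds.DataSieveDBSCAN())]

pipeline = Pipeline(steps)
print([name for name, _ in pipeline.steps])  # ['scaler', 'pca', 'post-pca-scaler', 'svm', 'di']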

View File

@@ -10,3 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py
lightgbm==3.3.5
xgboost==1.7.5
tensorboard==2.13.0
+datasieve==0.0.5

View File

@@ -9,9 +9,9 @@ from freqtrade.configuration import TimeRange
from freqtrade.data.dataprovider import DataProvider
from freqtrade.exceptions import OperationalException
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-from tests.conftest import get_patched_exchange, log_has_re
+from tests.conftest import get_patched_exchange # , log_has_re
from tests.freqai.conftest import (get_patched_data_kitchen, get_patched_freqai_strategy,
-make_data_dictionary, make_unfiltered_dataframe)
+make_unfiltered_dataframe) # make_data_dictionary,
from tests.freqai.test_freqai_interface import is_mac
@@ -72,66 +72,66 @@ def test_check_if_model_expired(mocker, freqai_conf):
shutil.rmtree(Path(dk.full_path))
+# def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# # freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1})
+# freqai.dk.use_DBSCAN_to_remove_outliers(predict=False)
+# assert log_has_re(r"DBSCAN found eps of 1\.7\d\.", caplog)
+# def test_compute_distances(mocker, freqai_conf):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1})
+# avg_mean_dist = freqai.dk.compute_distances()
+# assert round(avg_mean_dist, 2) == 1.98
+# def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, caplog):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1})
+# freqai.dk.use_SVM_to_remove_outliers(predict=False)
+# assert log_has_re(
+# "SVM detected 7.83%",
+# caplog,
+# )
+# def test_compute_inlier_metric(mocker, freqai_conf, caplog):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10})
+# freqai.dk.compute_inlier_metric(set_='train')
+# assert log_has_re(
+# "Inlier metric computed and added to features.",
+# caplog,
+# )
+# def test_add_noise_to_training_features(mocker, freqai_conf):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1})
+# freqai.dk.add_noise_to_training_features()
+# def test_remove_beginning_points_from_data_dict(mocker, freqai_conf):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# freqai.dk.remove_beginning_points_from_data_dict(set_='train')
+# def test_principal_component_analysis(mocker, freqai_conf, caplog):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# freqai.dk.principal_component_analysis()
+# assert log_has_re(
+# "reduced feature dimension by",
+# caplog,
+# )
+# def test_normalize_data(mocker, freqai_conf):
+# freqai = make_data_dictionary(mocker, freqai_conf)
+# data_dict = freqai.dk.data_dictionary
+# freqai.dk.normalize_data(data_dict)
+# assert any('_max' in entry for entry in freqai.dk.data.keys())
+# assert any('_min' in entry for entry in freqai.dk.data.keys())
def test_filter_features(mocker, freqai_conf):
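
With the old normalization and outlier tests commented out, an equivalent check against the new objects could exercise the pipeline API directly. A hedged sketch, not part of this commit, using only the fit_transform/transform/inverse_transform calls shown earlier in the diff:

# Illustrative sketch only -- not part of this commit.
import numpy as np
import pandas as pd
import datasieve.transforms as ds
from datasieve.pipeline import Pipeline


def test_pipeline_roundtrip_sketch():
    df = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])
    pipeline = Pipeline([("scaler", ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))])

    scaled, _, _ = pipeline.fit_transform(df)
    restored, _, _ = pipeline.inverse_transform(scaled)

    assert np.allclose(np.array(df), np.array(restored), atol=1e-6)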