From 5ac141f72b2df55d4ef9444a746860f73a82b8e6 Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Tue, 6 Jun 2023 21:05:51 +0200
Subject: [PATCH] convert to new datasieve api

---
 docs/freqai-feature-engineering.md            | 35 ++-----------------
 freqtrade/freqai/freqai_interface.py          | 22 ++++++------
 .../prediction_models/XGBoostRegressor.py     |  6 ++--
 freqtrade/freqai/transforms/__init__.py       |  6 ----
 .../freqai/transforms/quantile_transform.py   | 28 ---------------
 requirements-freqai.txt                       |  2 +-
 6 files changed, 18 insertions(+), 81 deletions(-)
 delete mode 100644 freqtrade/freqai/transforms/__init__.py
 delete mode 100644 freqtrade/freqai/transforms/quantile_transform.py

diff --git a/docs/freqai-feature-engineering.md b/docs/freqai-feature-engineering.md
index eb4b4272e..0eee0793b 100644
--- a/docs/freqai-feature-engineering.md
+++ b/docs/freqai-feature-engineering.md
@@ -254,47 +254,18 @@ Users are encouraged to customize the data pipeline to their needs by building t
         """
         User defines their custom eature pipeline here (if they wish)
         """
-        from freqtrade.freqai.transforms import FreqaiQuantileTransformer
+        from sklearn.preprocessing import QuantileTransformer
         dk.feature_pipeline = Pipeline([
-            ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+            ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
         ])
 
         return
 ```
 
-Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. If you have a custom step that you would like to add to the pipeline, you simply create a class that follows the DataSieve/SKLearn API. That means your step must have a `fit()`, `transform()`, `fit_transform()`, and `inverse_transform()` method. You can see examples of this in the `freqtrade.freqai.transforms` module where we use SKLearn `QuantileNormalization` to create a new step for the pipeline.
+Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. You can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class (imported with `from datasieve.transforms import SKLearnWrapper`), as shown above.
 
 As there is the `feature_pipeline`, there also exists a definition for the `label_pipeline` which can be defined the same way as the `feature_pipeline`, by overriding `define_label_pipeline`.
 
-!!! note "Inheritence required"
-    While most SKLearn methods are very easy to override, as shown in freqtrade/freqai/transforms/quantile_transform.py, they still need to include passing X, y, and sample_weights through all `fit()`, `transform()`, `fit_transform()` and `inverse_transform()` functions, even if that means a direct pass through without modifications.
-
-<!-- ## Data dimensionality reduction with Principal Component Analysis
-
-You can reduce the dimensionality of your features by activating the `principal_component_analysis` in the config:
-
-```json
-    "freqai": {
-        "feature_parameters" : {
-            "principal_component_analysis": true
-        }
-    }
-```
-
-This will perform PCA on the features and reduce their dimensionality so that the explained variance of the data set is >= 0.999. Reducing data dimensionality makes training the model faster and hence allows for more up-to-date models. 
-
-## Inlier metric
-
-The `inlier_metric` is a metric aimed at quantifying how similar the features of a data point are to the most recent historical data points. 
-
-You define the lookback window by setting `inlier_metric_window` and FreqAI computes the distance between the present time point and each of the previous `inlier_metric_window` lookback points. A Weibull function is fit to each of the lookback distributions and its cumulative distribution function (CDF) is used to produce a quantile for each lookback point. The `inlier_metric` is then computed for each time point as the average of the corresponding lookback quantiles. The figure below explains the concept for an `inlier_metric_window` of 5.
-
-![inlier-metric](assets/freqai_inlier-metric.jpg)
-
-FreqAI adds the `inlier_metric` to the training features and hence gives the model access to a novel type of temporal information. 
-
-This function does **not** remove outliers from the data set. -->
-
 ## Outlier detection
 
 Equity and crypto markets suffer from a high level of non-patterned noise in the form of outlier data points. FreqAI implements a variety of methods to identify such outliers and hence mitigate risk.
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index 3f04b17fb..ffe0ee8c3 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -12,8 +12,10 @@ import numpy as np
 import pandas as pd
 import psutil
 from datasieve.pipeline import Pipeline
+from datasieve.transforms import SKLearnWrapper
 from numpy.typing import NDArray
 from pandas import DataFrame
+from sklearn.preprocessing import MinMaxScaler
 
 from freqtrade.configuration import TimeRange
 from freqtrade.constants import Config
@@ -509,25 +511,25 @@ class IFreqaiModel(ABC):
         ft_params = self.freqai_info["feature_parameters"]
         dk.feature_pipeline = Pipeline([
             ('const', ds.DataSieveVarianceThreshold(threshold=0)),
-            ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))
+            ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
             ])
 
         if ft_params.get("principal_component_analysis", False):
-            dk.feature_pipeline.steps += [('pca', ds.DataSievePCA())]
-            dk.feature_pipeline.steps += [('post-pca-scaler',
-                                           ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
+            dk.feature_pipeline.append(('pca', ds.DataSievePCA()))
+            dk.feature_pipeline.append(('post-pca-scaler',
+                                        SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))))
 
         if ft_params.get("use_SVM_to_remove_outliers", False):
             svm_params = ft_params.get(
                 "svm_params", {"shuffle": False, "nu": 0.01})
-            dk.feature_pipeline.steps += [('svm', ds.SVMOutlierExtractor(**svm_params))]
+            dk.feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params)))
 
         di = ft_params.get("DI_threshold", 0)
         if di:
-            dk.feature_pipeline.steps += [('di', ds.DissimilarityIndex(di_threshold=di))]
+            dk.feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di)))
 
         if ft_params.get("use_DBSCAN_to_remove_outliers", False):
-            dk.feature_pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())]
+            dk.feature_pipeline.append(('dbscan', ds.DataSieveDBSCAN()))
 
         dk.feature_pipeline.fitparams = dk.feature_pipeline._validate_fitparams(
             {}, dk.feature_pipeline.steps)
@@ -538,7 +540,7 @@ class IFreqaiModel(ABC):
     def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None:
 
         dk.label_pipeline = Pipeline([
-            ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))
+            ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
             ])
 
     def model_exists(self, dk: FreqaiDataKitchen) -> bool:
@@ -551,8 +553,6 @@ class IFreqaiModel(ABC):
         """
         if self.dd.model_type == 'joblib':
             file_type = ".joblib"
-        elif self.dd.model_type == 'keras':
-            file_type = ".h5"
         elif self.dd.model_type in ["stable_baselines3", "sb3_contrib", "pytorch"]:
             file_type = ".zip"
 
@@ -676,7 +676,7 @@ class IFreqaiModel(ABC):
 
         # # for keras type models, the conv_window needs to be prepended so
         # # viewing is correct in frequi
-        if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0):
+        if self.ft_params.get('inlier_metric_window', 0):
             n_lost_points = self.freqai_info.get('conv_width', 2)
             zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))),
                                  columns=hist_preds_df.columns)
diff --git a/freqtrade/freqai/prediction_models/XGBoostRegressor.py b/freqtrade/freqai/prediction_models/XGBoostRegressor.py
index 88d348448..19c051b91 100644
--- a/freqtrade/freqai/prediction_models/XGBoostRegressor.py
+++ b/freqtrade/freqai/prediction_models/XGBoostRegressor.py
@@ -9,7 +9,7 @@ from freqtrade.freqai.tensorboard import TBCallback
 
 
 # from datasieve.pipeline import Pipeline
-# from freqtrade.freqai.transforms import FreqaiQuantileTransformer
+# from sklearn.preprocessing import QuantileTransformer
 
 logger = logging.getLogger(__name__)
 
@@ -61,7 +61,7 @@ class XGBoostRegressor(BaseRegressionModel):
     #     User defines their custom eature pipeline here (if they wish)
     #     """
     #     dk.feature_pipeline = Pipeline([
-    #         ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+    #         ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
     #     ])
 
     #     return
@@ -71,7 +71,7 @@ class XGBoostRegressor(BaseRegressionModel):
     #     User defines their custom label pipeline here (if they wish)
     #     """
     #     dk.label_pipeline = Pipeline([
-    #         ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+    #         ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
     #     ])
 
     #     return
diff --git a/freqtrade/freqai/transforms/__init__.py b/freqtrade/freqai/transforms/__init__.py
deleted file mode 100644
index 9b7d8ccf5..000000000
--- a/freqtrade/freqai/transforms/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from freqtrade.freqai.transforms.quantile_transform import FreqaiQuantileTransformer
-
-
-__all__ = (
-    "FreqaiQuantileTransformer",
-)
diff --git a/freqtrade/freqai/transforms/quantile_transform.py b/freqtrade/freqai/transforms/quantile_transform.py
deleted file mode 100644
index 3d1bd2731..000000000
--- a/freqtrade/freqai/transforms/quantile_transform.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from sklearn.preprocessing import QuantileTransformer
-
-
-class FreqaiQuantileTransformer(QuantileTransformer):
-    """
-    A subclass of the SKLearn Quantile that ensures fit, transform, fit_transform and
-    inverse_transform all take the full set of params X, y, sample_weight required to
-    benefit from the DataSieve features.
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        super().fit(X)
-        X = super().transform(X)
-        return X, y, sample_weight, feature_list
-
-    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        super().fit(X)
-        return X, y, sample_weight, feature_list
-
-    def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        X = super().transform(X)
-        return X, y, sample_weight, feature_list
-
-    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        return super().inverse_transform(X), y, sample_weight, feature_list
diff --git a/requirements-freqai.txt b/requirements-freqai.txt
index 31c73b594..748950e24 100644
--- a/requirements-freqai.txt
+++ b/requirements-freqai.txt
@@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py
 lightgbm==3.3.5
 xgboost==1.7.5
 tensorboard==2.13.0
-datasieve==0.1.0
+datasieve==0.1.1