convert to new datasieve api

2026-01-20 05:50:36 +00:00 · 2023-06-06 21:05:51 +02:00
parent f6a32f4ffd
commit 5ac141f72b
6 changed files with 18 additions and 81 deletions
--- a/docs/freqai-feature-engineering.md
+++ b/docs/freqai-feature-engineering.md
@@ -254,47 +254,18 @@ Users are encouraged to customize the data pipeline to their needs by building t
        """
        User defines their custom feature pipeline here (if they wish)
        """
-        from freqtrade.freqai.transforms import FreqaiQuantileTransformer
+        from sklearn.preprocessing import QuantileTransformer
        dk.feature_pipeline = Pipeline([
-            ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+            ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
        ])

        return
 ```

-Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. If you have a custom step that you would like to add to the pipeline, you simply create a class that follows the DataSieve/SKLearn API. That means your step must have a `fit()`, `transform()`, `fit_transform()`, and `inverse_transform()` method. You can see examples of this in the `freqtrade.freqai.transforms` module where we use SKLearn `QuantileNormalization` to create a new step for the pipeline.
+Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. You can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class (`from datasieve.transforms import SKLearnWrapper`), as shown above.

 As there is the `feature_pipeline`, there also exists a definition for the `label_pipeline` which can be defined the same way as the `feature_pipeline`, by overriding `define_label_pipeline`.

-!!! note "Inheritence required"
-    While most SKLearn methods are very easy to override, as shown in freqtrade/freqai/transforms/quantile_transform.py, they still need to include passing X, y, and sample_weights through all `fit()`, `transform()`, `fit_transform()` and `inverse_transform()` functions, even if that means a direct pass through without modifications.
-
-<!-- ## Data dimensionality reduction with Principal Component Analysis
-
-You can reduce the dimensionality of your features by activating the `principal_component_analysis` in the config:
-
-```json
-    "freqai": {
-        "feature_parameters" : {
-            "principal_component_analysis": true
-        }
-    }
-```
-
-This will perform PCA on the features and reduce their dimensionality so that the explained variance of the data set is >= 0.999. Reducing data dimensionality makes training the model faster and hence allows for more up-to-date models. 
-
-## Inlier metric
-
-The `inlier_metric` is a metric aimed at quantifying how similar the features of a data point are to the most recent historical data points. 
-
-You define the lookback window by setting `inlier_metric_window` and FreqAI computes the distance between the present time point and each of the previous `inlier_metric_window` lookback points. A Weibull function is fit to each of the lookback distributions and its cumulative distribution function (CDF) is used to produce a quantile for each lookback point. The `inlier_metric` is then computed for each time point as the average of the corresponding lookback quantiles. The figure below explains the concept for an `inlier_metric_window` of 5.
-
-![inlier-metric](assets/freqai_inlier-metric.jpg)
-
-FreqAI adds the `inlier_metric` to the training features and hence gives the model access to a novel type of temporal information. 
-
-This function does **not** remove outliers from the data set. -->
-
 ## Outlier detection

 Equity and crypto markets suffer from a high level of non-patterned noise in the form of outlier data points. FreqAI implements a variety of methods to identify such outliers and hence mitigate risk.
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -12,8 +12,10 @@ import numpy as np
 import pandas as pd
 import psutil
 from datasieve.pipeline import Pipeline
+from datasieve.transforms import SKLearnWrapper
 from numpy.typing import NDArray
 from pandas import DataFrame
+from sklearn.preprocessing import MinMaxScaler

 from freqtrade.configuration import TimeRange
 from freqtrade.constants import Config
@@ -509,25 +511,25 @@ class IFreqaiModel(ABC):
        ft_params = self.freqai_info["feature_parameters"]
        dk.feature_pipeline = Pipeline([
            ('const', ds.DataSieveVarianceThreshold(threshold=0)),
-            ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))
+            ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
            ])

        if ft_params.get("principal_component_analysis", False):
-            dk.feature_pipeline.steps += [('pca', ds.DataSievePCA())]
-            dk.feature_pipeline.steps += [('post-pca-scaler',
-                                           ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
+            dk.feature_pipeline.append(('pca', ds.DataSievePCA()))
+            dk.feature_pipeline.append(('post-pca-scaler',
+                                        SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))))

        if ft_params.get("use_SVM_to_remove_outliers", False):
            svm_params = ft_params.get(
                "svm_params", {"shuffle": False, "nu": 0.01})
-            dk.feature_pipeline.steps += [('svm', ds.SVMOutlierExtractor(**svm_params))]
+            dk.feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params)))

        di = ft_params.get("DI_threshold", 0)
        if di:
-            dk.feature_pipeline.steps += [('di', ds.DissimilarityIndex(di_threshold=di))]
+            dk.feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di)))

        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
-            dk.feature_pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())]
+            dk.feature_pipeline.append(('dbscan', ds.DataSieveDBSCAN()))

        dk.feature_pipeline.fitparams = dk.feature_pipeline._validate_fitparams(
            {}, dk.feature_pipeline.steps)
@@ -538,7 +540,7 @@ class IFreqaiModel(ABC):
    def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None:

        dk.label_pipeline = Pipeline([
-            ('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))
+            ('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
            ])

    def model_exists(self, dk: FreqaiDataKitchen) -> bool:
@@ -551,8 +553,6 @@ class IFreqaiModel(ABC):
        """
        if self.dd.model_type == 'joblib':
            file_type = ".joblib"
-        elif self.dd.model_type == 'keras':
-            file_type = ".h5"
        elif self.dd.model_type in ["stable_baselines3", "sb3_contrib", "pytorch"]:
            file_type = ".zip"

@@ -676,7 +676,7 @@ class IFreqaiModel(ABC):

        # # for keras type models, the conv_window needs to be prepended so
        # # viewing is correct in frequi
-        if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0):
+        if self.ft_params.get('inlier_metric_window', 0):
            n_lost_points = self.freqai_info.get('conv_width', 2)
            zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))),
                                 columns=hist_preds_df.columns)
--- a/freqtrade/freqai/prediction_models/XGBoostRegressor.py
+++ b/freqtrade/freqai/prediction_models/XGBoostRegressor.py
@@ -9,7 +9,7 @@ from freqtrade.freqai.tensorboard import TBCallback


 # from datasieve.pipeline import Pipeline
-# from freqtrade.freqai.transforms import FreqaiQuantileTransformer
+# from sklearn.preprocessing import QuantileTransformer

 logger = logging.getLogger(__name__)

@@ -61,7 +61,7 @@ class XGBoostRegressor(BaseRegressionModel):
    #     User defines their custom feature pipeline here (if they wish)
    #     """
    #     dk.feature_pipeline = Pipeline([
-    #         ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+    #         ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
    #     ])

    #     return
@@ -71,7 +71,7 @@ class XGBoostRegressor(BaseRegressionModel):
    #     User defines their custom label pipeline here (if they wish)
    #     """
    #     dk.label_pipeline = Pipeline([
-    #         ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
+    #         ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
    #     ])

    #     return
--- a/freqtrade/freqai/transforms/__init__.py
+++ b/freqtrade/freqai/transforms/__init__.py
@@ -1,6 +0,0 @@
-from freqtrade.freqai.transforms.quantile_transform import FreqaiQuantileTransformer
-
-
-__all__ = (
-    "FreqaiQuantileTransformer",
-)
--- a/freqtrade/freqai/transforms/quantile_transform.py
+++ b/freqtrade/freqai/transforms/quantile_transform.py
@@ -1,28 +0,0 @@
-from sklearn.preprocessing import QuantileTransformer
-
-
-class FreqaiQuantileTransformer(QuantileTransformer):
-    """
-    A subclass of the SKLearn Quantile that ensures fit, transform, fit_transform and
-    inverse_transform all take the full set of params X, y, sample_weight required to
-    benefit from the DataSieve features.
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        super().fit(X)
-        X = super().transform(X)
-        return X, y, sample_weight, feature_list
-
-    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        super().fit(X)
-        return X, y, sample_weight, feature_list
-
-    def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        X = super().transform(X)
-        return X, y, sample_weight, feature_list
-
-    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
-        return super().inverse_transform(X), y, sample_weight, feature_list
--- a/requirements-freqai.txt
+++ b/requirements-freqai.txt
@@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py
 lightgbm==3.3.5
 xgboost==1.7.5
 tensorboard==2.13.0
-datasieve==0.1.0
+datasieve==0.1.1