convert to new datasieve api

This commit is contained in:
robcaulk
2023-06-06 21:05:51 +02:00
parent f6a32f4ffd
commit 5ac141f72b
6 changed files with 18 additions and 81 deletions

View File

@@ -254,47 +254,18 @@ Users are encouraged to customize the data pipeline to their needs by building t
"""
User defines their custom feature pipeline here (if they wish)
"""
from freqtrade.freqai.transforms import FreqaiQuantileTransformer
from sklearn.preprocessing import QuantileTransformer
dk.feature_pipeline = Pipeline([
('qt', FreqaiQuantileTransformer(output_distribution='normal'))
('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
])
return
```
Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. If you have a custom step that you would like to add to the pipeline, you simply create a class that follows the DataSieve/SKLearn API. That means your step must have `fit()`, `transform()`, `fit_transform()`, and `inverse_transform()` methods. You can see examples of this in the `freqtrade.freqai.transforms` module where we use the SKLearn `QuantileTransformer` to create a new step for the pipeline.
Here, you are defining the exact pipeline that will be used for your feature set during training and prediction. Here you can use *most* SKLearn transformation steps by wrapping them in the `SKLearnWrapper` class.
As there is the `feature_pipeline`, there also exists a definition for the `label_pipeline` which can be defined the same way as the `feature_pipeline`, by overriding `define_label_pipeline`.
!!! note "Inheritance required"
While most SKLearn methods are very easy to override, as shown in freqtrade/freqai/transforms/quantile_transform.py, they still need to pass `X`, `y`, and `sample_weight` through all `fit()`, `transform()`, `fit_transform()` and `inverse_transform()` functions, even if that means a direct pass-through without modifications.
<!-- ## Data dimensionality reduction with Principal Component Analysis
You can reduce the dimensionality of your features by activating the `principal_component_analysis` in the config:
```json
"freqai": {
"feature_parameters" : {
"principal_component_analysis": true
}
}
```
This will perform PCA on the features and reduce their dimensionality so that the explained variance of the data set is >= 0.999. Reducing data dimensionality makes training the model faster and hence allows for more up-to-date models.
## Inlier metric
The `inlier_metric` is a metric aimed at quantifying how similar the features of a data point are to the most recent historical data points.
You define the lookback window by setting `inlier_metric_window` and FreqAI computes the distance between the present time point and each of the previous `inlier_metric_window` lookback points. A Weibull function is fit to each of the lookback distributions and its cumulative distribution function (CDF) is used to produce a quantile for each lookback point. The `inlier_metric` is then computed for each time point as the average of the corresponding lookback quantiles. The figure below explains the concept for an `inlier_metric_window` of 5.
![inlier-metric](assets/freqai_inlier-metric.jpg)
FreqAI adds the `inlier_metric` to the training features and hence gives the model access to a novel type of temporal information.
This function does **not** remove outliers from the data set. -->
## Outlier detection
Equity and crypto markets suffer from a high level of non-patterned noise in the form of outlier data points. FreqAI implements a variety of methods to identify such outliers and hence mitigate risk.

View File

@@ -12,8 +12,10 @@ import numpy as np
import pandas as pd
import psutil
from datasieve.pipeline import Pipeline
from datasieve.transforms import SKLearnWrapper
from numpy.typing import NDArray
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
@@ -509,25 +511,25 @@ class IFreqaiModel(ABC):
ft_params = self.freqai_info["feature_parameters"]
dk.feature_pipeline = Pipeline([
('const', ds.DataSieveVarianceThreshold(threshold=0)),
('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))
('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
])
if ft_params.get("principal_component_analysis", False):
dk.feature_pipeline.steps += [('pca', ds.DataSievePCA())]
dk.feature_pipeline.steps += [('post-pca-scaler',
ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))]
dk.feature_pipeline.append(('pca', ds.DataSievePCA()))
dk.feature_pipeline.append(('post-pca-scaler',
SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1)))))
if ft_params.get("use_SVM_to_remove_outliers", False):
svm_params = ft_params.get(
"svm_params", {"shuffle": False, "nu": 0.01})
dk.feature_pipeline.steps += [('svm', ds.SVMOutlierExtractor(**svm_params))]
dk.feature_pipeline.append(('svm', ds.SVMOutlierExtractor(**svm_params)))
di = ft_params.get("DI_threshold", 0)
if di:
dk.feature_pipeline.steps += [('di', ds.DissimilarityIndex(di_threshold=di))]
dk.feature_pipeline.append(('di', ds.DissimilarityIndex(di_threshold=di)))
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.feature_pipeline.steps += [('dbscan', ds.DataSieveDBSCAN())]
dk.feature_pipeline.append(('dbscan', ds.DataSieveDBSCAN()))
dk.feature_pipeline.fitparams = dk.feature_pipeline._validate_fitparams(
{}, dk.feature_pipeline.steps)
@@ -538,7 +540,7 @@ class IFreqaiModel(ABC):
def define_label_pipeline(self, dk: FreqaiDataKitchen) -> None:
dk.label_pipeline = Pipeline([
('scaler', ds.DataSieveMinMaxScaler(feature_range=(-1, 1)))
('scaler', SKLearnWrapper(MinMaxScaler(feature_range=(-1, 1))))
])
def model_exists(self, dk: FreqaiDataKitchen) -> bool:
@@ -551,8 +553,6 @@ class IFreqaiModel(ABC):
"""
if self.dd.model_type == 'joblib':
file_type = ".joblib"
elif self.dd.model_type == 'keras':
file_type = ".h5"
elif self.dd.model_type in ["stable_baselines3", "sb3_contrib", "pytorch"]:
file_type = ".zip"
@@ -676,7 +676,7 @@ class IFreqaiModel(ABC):
# # for keras type models, the conv_window needs to be prepended so
# # viewing is correct in frequi
if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0):
if self.ft_params.get('inlier_metric_window', 0):
n_lost_points = self.freqai_info.get('conv_width', 2)
zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))),
columns=hist_preds_df.columns)

View File

@@ -9,7 +9,7 @@ from freqtrade.freqai.tensorboard import TBCallback
# from datasieve.pipeline import Pipeline
# from freqtrade.freqai.transforms import FreqaiQuantileTransformer
# from sklearn.preprocessing import QuantileTransformer
logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ class XGBoostRegressor(BaseRegressionModel):
# User defines their custom feature pipeline here (if they wish)
# """
# dk.feature_pipeline = Pipeline([
# ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
# ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
# ])
# return
@@ -71,7 +71,7 @@ class XGBoostRegressor(BaseRegressionModel):
# User defines their custom label pipeline here (if they wish)
# """
# dk.label_pipeline = Pipeline([
# ('qt', FreqaiQuantileTransformer(output_distribution='normal'))
# ('qt', SKLearnWrapper(QuantileTransformer(output_distribution='normal')))
# ])
# return

View File

@@ -1,6 +0,0 @@
from freqtrade.freqai.transforms.quantile_transform import FreqaiQuantileTransformer
__all__ = (
"FreqaiQuantileTransformer",
)

View File

@@ -1,28 +0,0 @@
from sklearn.preprocessing import QuantileTransformer
class FreqaiQuantileTransformer(QuantileTransformer):
    """
    SKLearn QuantileTransformer variant for use as a DataSieve pipeline step.

    Every one of fit, transform, fit_transform and inverse_transform accepts
    the full set of arguments (X, y, sample_weight, feature_list) and hands
    them back as a 4-tuple. Only X is ever passed to the underlying
    QuantileTransformer; y, sample_weight and feature_list are returned
    unchanged, and any extra keyword arguments are accepted but ignored.
    """

    def __init__(self, **kwargs):
        # All configuration is forwarded verbatim to QuantileTransformer.
        super().__init__(**kwargs)

    def fit(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """Fit the quantile mapping on X and return the inputs untouched."""
        super().fit(X)
        return X, y, sample_weight, feature_list

    def transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """Return the transformed X together with the pass-through values."""
        transformed = super().transform(X)
        return transformed, y, sample_weight, feature_list

    def fit_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """Fit on X, then return the transformed X with the pass-through values."""
        # Deliberately goes through the parent's fit/transform so the tuple
        # returning overrides in this class are not re-entered.
        super().fit(X)
        transformed = super().transform(X)
        return transformed, y, sample_weight, feature_list

    def inverse_transform(self, X, y=None, sample_weight=None, feature_list=None, **kwargs):
        """Return the inverse-transformed X with the pass-through values."""
        restored = super().inverse_transform(X)
        return restored, y, sample_weight, feature_list

View File

@@ -10,4 +10,4 @@ catboost==1.2; 'arm' not in platform_machine and (sys_platform != 'darwin' or py
lightgbm==3.3.5
xgboost==1.7.5
tensorboard==2.13.0
datasieve==0.1.0
datasieve==0.1.1