From 7e1e388b9ce492d390671df30b70ceabd5415e06 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 18:24:30 +0200 Subject: [PATCH] Add feather/parquet docs --- docs/data-download.md | 44 ++++++++++++++++++-- freqtrade/data/history/featherdatahandler.py | 9 ++-- freqtrade/data/history/parquetdatahandler.py | 9 ++-- 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/docs/data-download.md b/docs/data-download.md index 2b76d4f74..60e3f5efe 100644 --- a/docs/data-download.md +++ b/docs/data-download.md @@ -179,9 +179,11 @@ freqtrade download-data --exchange binance --pairs ETH/USDT XRP/USDT BTC/USDT -- Freqtrade currently supports 3 data-formats for both OHLCV and trades data: -* `json` (plain "text" json files) -* `jsongz` (a gzip-zipped version of json files) -* `hdf5` (a high performance datastore) +* `json` - plain "text" json files +* `jsongz` - a gzip-zipped version of json files +* `hdf5` - a high performance datastore +* `feather` - a dataformat based on Apache Arrow +* `parquet` - columnar datastore By default, OHLCV data is stored as `json` data, while trades data is stored as `jsongz` data. @@ -200,6 +202,42 @@ If the default data-format has been changed during download, then the keys `data !!! Note You can convert between data-formats using the [convert-data](#sub-command-convert-data) and [convert-trade-data](#sub-command-convert-trade-data) methods. +#### Dataformat comparison + +The comparisons below were made with the following data, using the Linux `time` command. + +``` +Found 6 pair / timeframe combinations. 
++----------+-------------+--------+---------------------+---------------------+ +| Pair | Timeframe | Type | From | To | +|----------+-------------+--------+---------------------+---------------------| +| BTC/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:25:00 | +| ETH/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:26:00 | +| BTC/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:30:00 | +| XRP/USDT | 5m | spot | 2018-05-04 08:10:00 | 2022-09-13 19:15:00 | +| XRP/USDT | 1m | spot | 2018-05-04 08:11:00 | 2022-09-13 19:22:00 | +| ETH/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:20:00 | ++----------+-------------+--------+---------------------+---------------------+ +``` + +Timings have been taken in a not very scientific way with the following command, which forces reading the data into memory. + +``` bash +time freqtrade list-data --show-timerange --data-format-ohlcv <dataformat> +``` + +| Format | Size | Timing | +|------------|-------------|-------------| +| `json` | 149Mb | 25.6s | +| `jsongz` | 39Mb | 27s | +| `hdf5` | 145Mb | 3.9s | +| `feather` | 72Mb | 3.5s | +| `parquet` | 83Mb | 3.8s | + +Size has been taken from the BTC/USDT 1m spot combination for the timerange specified above. + +For the best performance/size mix, we recommend using either feather or parquet. 
+ #### Sub-command convert data ``` diff --git a/freqtrade/data/history/featherdatahandler.py b/freqtrade/data/history/featherdatahandler.py index dfb818ca8..22a6805e7 100644 --- a/freqtrade/data/history/featherdatahandler.py +++ b/freqtrade/data/history/featherdatahandler.py @@ -58,12 +58,9 @@ class FeatherDataHandler(IDataHandler): self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) if not filename.exists(): return DataFrame(columns=self._columns) - try: - pairdata = read_feather(filename) - pairdata.columns = self._columns - except ValueError: - logger.error(f"Could not load data for {pair}.") - return DataFrame(columns=self._columns) + + pairdata = read_feather(filename) + pairdata.columns = self._columns pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) pairdata['date'] = to_datetime(pairdata['date'], diff --git a/freqtrade/data/history/parquetdatahandler.py b/freqtrade/data/history/parquetdatahandler.py index 283d90ec0..57581861d 100644 --- a/freqtrade/data/history/parquetdatahandler.py +++ b/freqtrade/data/history/parquetdatahandler.py @@ -57,12 +57,9 @@ class ParquetDataHandler(IDataHandler): self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) if not filename.exists(): return DataFrame(columns=self._columns) - try: - pairdata = read_parquet(filename) - pairdata.columns = self._columns - except ValueError: - logger.error(f"Could not load data for {pair}.") - return DataFrame(columns=self._columns) + + pairdata = read_parquet(filename) + pairdata.columns = self._columns pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) pairdata['date'] = to_datetime(pairdata['date'],