feat: stop on 404 to prevent missing data

This commit is contained in:
Meng Xiangzhuo
2024-10-30 07:59:06 +08:00
parent b5f503dbbb
commit e49b5b03db
3 changed files with 40 additions and 10 deletions

View File

@@ -142,6 +142,7 @@ class Binance(Exchange):
timeframe=timeframe,
since_ms=since_ms,
until_ms=until_ms,
stop_on_404=True,
)
)
if df.empty:

View File

@@ -27,12 +27,18 @@ class BadHttpStatus(Exception):
async def fetch_ohlcv(
candle_type: CandleType, pair: str, timeframe: str, since_ms: int, until_ms: int | None
candle_type: CandleType,
pair: str,
timeframe: str,
since_ms: int,
until_ms: int | None,
stop_on_404: bool = False,
) -> DataFrame:
"""
Fetch OHLCV data from https://data.binance.vision/
:candle_type: Currently only spot and futures are supported
:param until_ms: `None` indicates the timestamp of the latest available data
:param stop_on_404: Stop to download the following data when a 404 returned
:return: None if no data available in the time range
"""
if candle_type == CandleType.SPOT:
@@ -51,7 +57,7 @@ async def fetch_ohlcv(
end = min(end, last_available_date)
if start >= end:
return DataFrame()
return await _fetch_ohlcv(asset_type, symbol, timeframe, start, end)
return await _fetch_ohlcv(asset_type, symbol, timeframe, start, end, stop_on_404)
def symbol_ccxt_to_binance(symbol: str) -> str:
@@ -60,7 +66,7 @@ def symbol_ccxt_to_binance(symbol: str) -> str:
e.g. BTC/USDT -> BTCUSDT, BTC/USDT:USDT -> BTCUSDT
"""
if ":" in symbol:
parts = symbol.split()
parts = symbol.split(":")
if len(parts) != 2:
raise ValueError(f"Cannot recognize symbol: {symbol}")
return parts[0].replace("/", "")
@@ -75,7 +81,14 @@ def concat(dfs) -> DataFrame:
return pd.concat(dfs)
async def _fetch_ohlcv(asset_type, symbol, timeframe, start, end) -> DataFrame:
async def _fetch_ohlcv(
asset_type: str,
symbol: str,
timeframe: str,
start: datetime.date,
end: datetime.date,
stop_on_404: bool,
) -> DataFrame:
dfs: list[DataFrame | None] = []
connector = aiohttp.TCPConnector(limit=100)
@@ -93,6 +106,9 @@ async def _fetch_ohlcv(asset_type, symbol, timeframe, start, end) -> DataFrame:
# Directly return the existing data, do not allow the gap
# between the data
return concat(dfs)
elif result is None and stop_on_404:
logger.debug("Abort downloading from data.binance.vision due to 404")
return concat(dfs)
else:
dfs.append(result)
return concat(dfs)
@@ -175,12 +191,12 @@ async def get_daily_ohlcv(
df["date"] = pd.to_datetime(df["date"], unit="ms", utc=True)
return df
elif resp.status == 404:
logger.warning(f"No data available for {symbol} in {format_date(date)}")
logger.debug(f"No data available for {symbol} in {format_date(date)}")
return None
else:
raise BadHttpStatus(f"{resp.status} - {resp.reason}")
except Exception as e:
retry += 1
if retry >= retry_count:
logger.warning(f"Failed to get data from {url}: {e}")
logger.debug(f"Failed to get data from {url}: {e}")
raise

View File

@@ -93,48 +93,61 @@ def make_response_from_url(start_date, end_date):
@pytest.mark.parametrize(
"since,until,first_date,last_date",
"since,until,first_date,last_date,stop_on_404",
[
(dt_utc(2020, 1, 1), dt_utc(2020, 1, 2), dt_utc(2020, 1, 1), dt_utc(2020, 1, 2, 23)),
(dt_utc(2020, 1, 1), dt_utc(2020, 1, 2), dt_utc(2020, 1, 1), dt_utc(2020, 1, 2, 23), False),
(
dt_utc(2020, 1, 1),
dt_utc(2020, 1, 1, 23, 59, 59),
dt_utc(2020, 1, 1),
dt_utc(2020, 1, 1, 23),
False,
),
(
dt_utc(2020, 1, 1),
dt_utc(2020, 1, 5),
dt_utc(2020, 1, 1),
dt_utc(2020, 1, 3, 23),
False,
),
(
dt_utc(2019, 1, 1),
dt_utc(2020, 1, 5),
dt_utc(2020, 1, 1),
dt_utc(2020, 1, 3, 23),
False,
),
(
dt_utc(2019, 1, 1),
dt_utc(2019, 1, 5),
None,
None,
False,
),
(
dt_utc(2021, 1, 1),
dt_utc(2021, 1, 5),
None,
None,
False,
),
(
dt_utc(2020, 1, 2),
None,
dt_utc(2020, 1, 2),
dt_utc(2020, 1, 3, 23),
False,
),
(
dt_utc(2019, 1, 1),
dt_utc(2020, 1, 5),
None,
None,
True,
),
],
)
async def test_fetch_ohlcv(mocker, since, until, first_date, last_date):
async def test_fetch_ohlcv(mocker, since, until, first_date, last_date, stop_on_404):
history_start = dt_utc(2020, 1, 1).date()
history_end = dt_utc(2020, 1, 3).date()
candle_type = CandleType.SPOT
@@ -147,7 +160,7 @@ async def test_fetch_ohlcv(mocker, since, until, first_date, last_date):
mocker.patch(
"aiohttp.ClientSession.get", side_effect=make_response_from_url(history_start, history_end)
)
df = await fetch_ohlcv(candle_type, pair, timeframe, since_ms, until_ms)
df = await fetch_ohlcv(candle_type, pair, timeframe, since_ms, until_ms, stop_on_404)
if df.empty:
assert first_date is None and last_date is None