Merge pull request #12201 from mihalt/fix_merge_informative_pair

Fix the truncation of values by merge_ordered in merge_informative_pair
This commit is contained in:
Matthias
2025-10-12 10:15:47 +02:00
committed by GitHub
2 changed files with 42 additions and 2 deletions

View File

@@ -92,6 +92,21 @@ def merge_informative_pair(
right_on=date_merge,
how="left",
)
if len(dataframe) > 1 and len(informative) > 0 and pd.isnull(dataframe.at[0, date_merge]):
# If the start dates of the dataframes are not aligned, the first rows will be NaN
# We can fill these with the last available informative candle before the start date
# while still avoiding lookahead bias - as only past data is used.
first_valid_idx = dataframe[date_merge].first_valid_index()
if first_valid_idx:
first_valid_date_merge = dataframe.at[first_valid_idx, date_merge]
matching_informative_raws = informative[
informative[date_merge] < first_valid_date_merge
]
if not matching_informative_raws.empty:
dataframe.loc[: first_valid_idx - 1] = dataframe.loc[
: first_valid_idx - 1
].fillna(matching_informative_raws.iloc[-1])
else:
dataframe = pd.merge(
dataframe, informative, left_on="date", right_on=date_merge, how="left"

View File

@@ -34,7 +34,8 @@ def test_merge_informative_pair():
assert "volume_1h" in result.columns
assert result["volume"].equals(data["volume"])
# First 3 rows are empty
# First 3 rows are empty.
# Pre-fillup doesn't happen as there is no prior candle in the informative dataframe
assert result.iloc[0]["date_1h"] is pd.NaT
assert result.iloc[1]["date_1h"] is pd.NaT
assert result.iloc[2]["date_1h"] is pd.NaT
@@ -109,13 +110,37 @@ def test_merge_informative_pair_monthly():
# The November 1M candle is not available yet at this point; the prior month's candle is used.
candle3 = result.loc[(result["date"] == "2022-11-30T22:00:00.000Z")]
assert candle3.iloc[0]["date"] == pd.Timestamp("2022-11-30T22:00:00.000Z")
assert candle3.iloc[0]["date_1M"] is pd.NaT
# Merged on prior month
assert candle3.iloc[0]["date_1M"] == pd.Timestamp("2022-10-01T00:00:00.000Z")
# First candle with 1M data merged.
candle4 = result.loc[(result["date"] == "2022-11-30T23:00:00.000Z")]
assert candle4.iloc[0]["date"] == pd.Timestamp("2022-11-30T23:00:00.000Z")
assert candle4.iloc[0]["date_1M"] == pd.Timestamp("2022-11-01T00:00:00.000Z")
# Very first candle in the result dataframe
# Merged the latest informative candle before the start-date
candle5 = result.iloc[0]
assert candle5["date"] == pd.Timestamp("2022-11-28T00:00:00.000Z")
assert candle5["date_1M"] == pd.Timestamp("2022-10-01T00:00:00.000Z")
def test_merge_informative_pair_no_overlap():
    """Informative data that ends before the main data begins must leave merged columns empty."""
    # Main dataframe - covers roughly a day.
    main_df = generate_test_data("1m", 1440, "2022-11-28")
    # Informative dataframe - stops WAY before the main data starts.
    informative_df = generate_test_data("1h", 40, "2022-11-01")

    merged = merge_informative_pair(main_df, informative_df, "1m", "1h", ffill=True)

    # The merge must keep the shape and the dates of the main dataframe.
    assert isinstance(merged, pd.DataFrame)
    assert len(merged) == len(main_df)
    merged_columns = merged.columns
    assert "date" in merged_columns
    assert merged["date"].equals(main_df["date"])

    # The informative column is present, but without any overlap
    # forward filling should not fill anything.
    assert "date_1h" in merged_columns
    informative_dates_missing = merged["date_1h"].isnull()
    assert informative_dates_missing.all()
def test_merge_informative_pair_same():
data = generate_test_data("15m", 40)