diff --git a/docs/freqai-reinforcement-learning.md b/docs/freqai-reinforcement-learning.md
index b1a212a92..f3d6c97f8 100644
--- a/docs/freqai-reinforcement-learning.md
+++ b/docs/freqai-reinforcement-learning.md
@@ -247,6 +247,32 @@ where `unique-id` is the `identifier` set in the `freqai` configuration file. Th
 
 ![tensorboard](assets/tensorboard.jpg)
 
+
+### Custom logging
+
+FreqAI also provides a built-in episodic summary logger called `self.tensorboard_log` for adding custom information to the Tensorboard log. By default, this function is already called once per step inside the environment to record the agent actions. All values accumulated for all steps in a single episode are reported at the conclusion of each episode, followed by a full reset of all metrics to 0 in preparation for the subsequent episode.
+
+
+`self.tensorboard_log` can also be used anywhere inside the environment; for example, it can be added to the `calculate_reward` function to collect more detailed information about how often various parts of the reward were called:
+
+```py
+    class MyRLEnv(Base5ActionRLEnv):
+        """
+        User-made custom environment. This class inherits from BaseEnvironment and gym.Env.
+        Users can override any functions from those parent classes. Here is an example
+        of a user-customized `calculate_reward()` function.
+        """
+        def calculate_reward(self, action: int) -> float:
+            if not self._is_valid(action):
+                self.tensorboard_log("is_valid")
+                return -2
+
+```
+
+!!! Note
+    The `self.tensorboard_log()` function is designed for tracking incremented objects only, i.e. events or actions inside the training environment. If the event of interest is a float, the float can be passed as the second argument, e.g. `self.tensorboard_log("float_metric1", 0.23)` would add 0.23 to `float_metric1`. In this case you can also disable incrementing with the `inc=False` parameter, so the value is overwritten rather than accumulated.
+
+
 ### Choosing a base environment
 
 FreqAI provides two base environments, `Base4ActionEnvironment` and `Base5ActionEnvironment`. As the names imply, the environments are customized for agents that can select from 4 or 5 actions. In the `Base4ActionEnvironment`, the agent can enter long, enter short, hold neutral, or exit position. Meanwhile, in the `Base5ActionEnvironment`, the agent has the same actions as Base4, but instead of a single exit action, it separates exit long and exit short. The main changes stemming from the environment selection include:
diff --git a/freqtrade/freqai/RL/Base4ActionRLEnv.py b/freqtrade/freqai/RL/Base4ActionRLEnv.py
index 79616d778..a3ebfdbfa 100644
--- a/freqtrade/freqai/RL/Base4ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base4ActionRLEnv.py
@@ -46,9 +46,9 @@ class Base4ActionRLEnv(BaseEnvironment):
             self._done = True
 
         self._update_unrealized_total_profit()
-
         step_reward = self.calculate_reward(action)
         self.total_reward += step_reward
+        self.tensorboard_log(self.actions._member_names_[action])
 
         trade_type = None
         if self.is_tradesignal(action):
diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py
index 1c09f9386..22d3cae30 100644
--- a/freqtrade/freqai/RL/Base5ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py
@@ -49,6 +49,7 @@ class Base5ActionRLEnv(BaseEnvironment):
         self._update_unrealized_total_profit()
         step_reward = self.calculate_reward(action)
         self.total_reward += step_reward
+        self.tensorboard_log(self.actions._member_names_[action])
 
         trade_type = None
         if self.is_tradesignal(action):
diff --git a/freqtrade/freqai/RL/BaseEnvironment.py b/freqtrade/freqai/RL/BaseEnvironment.py
index 86c63c382..5a5a950e7 100644
--- a/freqtrade/freqai/RL/BaseEnvironment.py
+++ b/freqtrade/freqai/RL/BaseEnvironment.py
@@ -2,7 +2,7 @@ import logging
 import random
 from abc import abstractmethod
 from enum import Enum
-from typing import Optional, Type
+from typing import Optional, Type, Union
 
 import gym
 import numpy as np
@@ -78,7 +78,7 @@ class BaseEnvironment(gym.Env):
 
         # set here to default 5Ac, but all children envs can override this
         self.actions: Type[Enum] = BaseActions
-        self.custom_info: dict = {}
+        self.tensorboard_metrics: dict = {}
         self.live: bool = False
         if dp:
             self.live = dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
@@ -139,20 +139,38 @@ class BaseEnvironment(gym.Env):
         self.np_random, seed = seeding.np_random(seed)
         return [seed]
 
+    def tensorboard_log(self, metric: str, value: Union[int, float] = 1, inc: bool = True):
+        """
+        Function builds the tensorboard_metrics dictionary
+        to be parsed by the TensorboardCallback. This
+        function is designed for tracking incremented objects,
+        i.e. events or actions inside the training environment.
+        For example, a user can call this to track the
+        frequency of occurrence of an `is_valid` call in
+        their `calculate_reward()`:
+
+        def calculate_reward(self, action: int) -> float:
+            if not self._is_valid(action):
+                self.tensorboard_log("is_valid")
+                return -2
+
+        :param metric: metric to be tracked and incremented
+        :param value: value to increment `metric` by
+        :param inc: sets whether the `value` is incremented or not
+        """
+        if not inc or metric not in self.tensorboard_metrics:
+            self.tensorboard_metrics[metric] = value
+        else:
+            self.tensorboard_metrics[metric] += value
+
+    def reset_tensorboard_log(self):
+        self.tensorboard_metrics = {}
+
     def reset(self):
         """
         Reset is called at the beginning of every episode
         """
-        # custom_info is used for episodic reports and tensorboard logging
-        self.custom_info["Invalid"] = 0
-        self.custom_info["Hold"] = 0
-        self.custom_info["Unknown"] = 0
-        self.custom_info["pnl_factor"] = 0
-        self.custom_info["duration_factor"] = 0
-        self.custom_info["reward_exit"] = 0
-        self.custom_info["reward_hold"] = 0
-        for action in self.actions:
-            self.custom_info[f"{action.name}"] = 0
+        self.reset_tensorboard_log()
 
         self._done = False
diff --git a/freqtrade/freqai/RL/TensorboardCallback.py b/freqtrade/freqai/RL/TensorboardCallback.py
index f590bdf84..b596742e9 100644
--- a/freqtrade/freqai/RL/TensorboardCallback.py
+++ b/freqtrade/freqai/RL/TensorboardCallback.py
@@ -42,19 +42,18 @@ class TensorboardCallback(BaseCallback):
         )
 
     def _on_step(self) -> bool:
-        custom_info = self.training_env.get_attr("custom_info")[0]
-        self.logger.record("_state/position", self.locals["infos"][0]["position"])
-        self.logger.record("_state/trade_duration", self.locals["infos"][0]["trade_duration"])
-        self.logger.record("_state/current_profit_pct", self.locals["infos"]
-                           [0]["current_profit_pct"])
-        self.logger.record("_reward/total_profit", self.locals["infos"][0]["total_profit"])
-        self.logger.record("_reward/total_reward", self.locals["infos"][0]["total_reward"])
-        self.logger.record_mean("_reward/mean_trade_duration", self.locals["infos"]
-                                [0]["trade_duration"])
-        self.logger.record("_actions/action", self.locals["infos"][0]["action"])
-        self.logger.record("_actions/_Invalid", custom_info["Invalid"])
-        self.logger.record("_actions/_Unknown", custom_info["Unknown"])
-        self.logger.record("_actions/Hold", custom_info["Hold"])
-        for action in self.actions:
-            self.logger.record(f"_actions/{action.name}", custom_info[action.name])
+
+        local_info = self.locals["infos"][0]
+        tensorboard_metrics = self.training_env.get_attr("tensorboard_metrics")[0]
+
+        for info in local_info:
+            if info not in ["episode", "terminal_observation"]:
+                self.logger.record(f"_info/{info}", local_info[info])
+
+        for info in tensorboard_metrics:
+            if info in [action.name for action in self.actions]:
+                self.logger.record(f"_actions/{info}", tensorboard_metrics[info])
+            else:
+                self.logger.record(f"_custom/{info}", tensorboard_metrics[info])
+
         return True
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner.py b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
index e62110347..2a87151f9 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
@@ -100,7 +100,7 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
             """
             # first, penalize if the action is not valid
             if not self._is_valid(action):
-                self.custom_info["Invalid"] += 1
+                self.tensorboard_log("is_valid")
                 return -2
 
             pnl = self.get_unrealized_profit()
@@ -109,15 +109,12 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
 
             # reward agent for entering trades
             if (action == Actions.Long_enter.value
                     and self._position == Positions.Neutral):
-                self.custom_info[f"{Actions.Long_enter.name}"] += 1
                 return 25
             if (action == Actions.Short_enter.value
                     and self._position == Positions.Neutral):
-                self.custom_info[f"{Actions.Short_enter.name}"] += 1
                 return 25
 
             # discourage agent from not entering trades
             if action == Actions.Neutral.value and self._position == Positions.Neutral:
-                self.custom_info[f"{Actions.Neutral.name}"] += 1
                 return -1
 
             max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
@@ -131,22 +128,18 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
 
             # discourage sitting in position
             if (self._position in (Positions.Short, Positions.Long) and
                     action == Actions.Neutral.value):
-                self.custom_info["Hold"] += 1
                 return -1 * trade_duration / max_trade_duration
 
             # close long
             if action == Actions.Long_exit.value and self._position == Positions.Long:
                 if pnl > self.profit_aim * self.rr:
                     factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-                self.custom_info[f"{Actions.Long_exit.name}"] += 1
                 return float(pnl * factor)
 
             # close short
             if action == Actions.Short_exit.value and self._position == Positions.Short:
                 if pnl > self.profit_aim * self.rr:
                     factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-                self.custom_info[f"{Actions.Short_exit.name}"] += 1
                 return float(pnl * factor)
 
-            self.custom_info["Unknown"] += 1
             return 0.
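
For anyone who wants to sanity-check the accumulation semantics of the new `tensorboard_log()` without spinning up a training run, here is a minimal standalone sketch. It simply mirrors the logic added to `BaseEnvironment` in this diff; the `DemoEnv` class and the metric names are illustrative only and are not part of the change:

```py
from typing import Union


class DemoEnv:
    """Standalone mirror of the `tensorboard_log`/`reset_tensorboard_log`
    logic added to BaseEnvironment above (illustrative only)."""

    def __init__(self):
        self.tensorboard_metrics: dict = {}

    def tensorboard_log(self, metric: str, value: Union[int, float] = 1, inc: bool = True):
        # First call for a metric (or any call with inc=False) stores the raw
        # value; subsequent calls with inc=True accumulate onto it.
        if not inc or metric not in self.tensorboard_metrics:
            self.tensorboard_metrics[metric] = value
        else:
            self.tensorboard_metrics[metric] += value

    def reset_tensorboard_log(self):
        # In the real environment this runs inside reset(), i.e. once per episode.
        self.tensorboard_metrics = {}


env = DemoEnv()
env.tensorboard_log("is_valid")               # counter -> 1
env.tensorboard_log("is_valid")               # counter -> 2
env.tensorboard_log("float_metric1", 0.23)    # accumulating float -> 0.23
env.tensorboard_log("float_metric1", 0.23)    # -> 0.46
env.tensorboard_log("gauge", 0.9, inc=False)  # overwrite, no accumulation
print(env.tensorboard_metrics)
# {'is_valid': 2, 'float_metric1': 0.46, 'gauge': 0.9}
env.reset_tensorboard_log()                   # wiped at episode boundaries
```

`TensorboardCallback` then routes each accumulated key either to an `_actions/<name>` series (when the key matches an action name) or to `_custom/<name>`, per the `_on_step()` changes above.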