diff --git a/src/LLM/OllamaChat.py b/src/LLM/OllamaChat.py
index 33b052f..6d678fe 100644
--- a/src/LLM/OllamaChat.py
+++ b/src/LLM/OllamaChat.py
@@ -110,6 +110,8 @@ def generate_simple_response(self,
         Generate a simple response without historic.

         Args:
+            prompt (str): The user prompt.
+            sys_prompt (str, optional): The system prompt.
             stream (bool, optional): Stream response in real-time
             additional_options (dict, optional): Temporary generation options
diff --git a/src/PolicyTrainer/CustomRewardWrapper.py b/src/PolicyTrainer/CustomRewardWrapper.py
index 350d0c3..e4b06d7 100644
--- a/src/PolicyTrainer/CustomRewardWrapper.py
+++ b/src/PolicyTrainer/CustomRewardWrapper.py
@@ -4,12 +4,27 @@

 class CustomRewardWrapper(gym.Wrapper):
-    def __init__(self, env, success_func: Callable = None, llm_reward_function: Callable = None):
+    def __init__(self, env: gym.Env, success_func: Callable = None, llm_reward_function: Callable = None):
+        """Initialize the custom reward wrapper.
+
+        Args:
+            env (gym.Env): The environment to wrap.
+            success_func (Callable, optional): Function that returns True when the episode is a success. Defaults to None.
+            llm_reward_function (Callable, optional): The generated reward function. Defaults to None.
+        """
         super().__init__(env)
         self.success_func = success_func
         self.llm_reward_function = llm_reward_function

     def step(self, action):
+        """Override the step function to integrate the generated reward function.
+
+        Args:
+            action: The action to execute.
+
+        Returns:
+            tuple: observation, reward, terminated, truncated, info
+        """
         observation, original_reward, terminated, truncated, info = self.env.step(
             action
         )
diff --git a/src/PolicyTrainer/PolicyTrainer.py b/src/PolicyTrainer/PolicyTrainer.py
index a95b50e..521a379 100644
--- a/src/PolicyTrainer/PolicyTrainer.py
+++ b/src/PolicyTrainer/PolicyTrainer.py
@@ -12,6 +12,8 @@
 from time import sleep
 from queue import Empty

+from stable_baselines3.common.vec_env.base_vec_env import VecEnv
+
 from PolicyTrainer.CustomRewardWrapper import CustomRewardWrapper
 from PolicyTrainer.TrainingInfoCallback import TrainingInfoCallback
 from State.State import State
@@ -24,6 +26,13 @@ class PolicyTrainer:

     def __init__(self, memory: list[State], env_type: EnvType, timeout: int):
+        """Initialize the policy trainer.
+
+        Args:
+            memory (list[State]): The list of states to train policies for.
+            env_type (EnvType): Parameters of the environment.
+            timeout (int): Timeout for model.learn().
+        """
         self.logger = getLogger("VIRAL")
         self.memory = memory
         self.timeout = timeout
@@ -48,7 +57,12 @@ def __init__(self, memory: list[State], env_type: EnvType, timeout: int):
         self._learning(self.memory[0])

     def _learning(self, state: State, queue: Queue = None) -> None:
-        """train a policy on an environment"""
+        """Train a policy on an environment.
+
+        Args:
+            state (State): The state whose reward function is trained.
+            queue (Queue, optional): Queue used to return the modified state. Defaults to None.
+ """ self.logger.debug( f"state {state.idx} begin is learning with reward function: {state.reward_func_str}" ) @@ -71,7 +85,7 @@ def _learning(self, state: State, queue: Queue = None) -> None: ) def evaluate_policy(self, idx1: int, idx2: int) -> int: - """ + """ TODO to be change, i think evaluate if the policy give is better than the original Evaluate policy performance for multiple reward functions Args: @@ -134,11 +148,22 @@ def evaluate_policy(self, idx1: int, idx2: int) -> int: def test_policy( self, - env, + env: VecEnv, policy, - numvenv, - nb_episodes=10, + numvenv: int, + nb_episodes: int = 100, ) -> float: + """test a policy already train + + Args: + env (VecEnv): envs + policy (): can be PPO or other RLAlgo + numvenv (int): number of env in the vec + nb_episodes (int, optional): . Defaults to 100. + + Returns: + float: _description_ + """ all_rewards = [] nb_success = 0 @@ -163,7 +188,13 @@ def test_policy( success_rate = nb_success / nb_episodes return success_rate - def test_policy_hf(self, policy_path, nb_episodes = 100): + def test_policy_hf(self, policy_path: str, nb_episodes: int = 100): + """visualise a policy + + Args: + policy_path (str): the path of the policy to load + nb_episodes (int, optional): . Defaults to 100. + """ env = make(self.env_name, render_mode='human') if self.algo == Algo.PPO: policy = PPO.load(policy_path) @@ -175,9 +206,17 @@ def test_policy_hf(self, policy_path, nb_episodes = 100): obs, _, term, trunc, _ = env.step(actions) done = term or trunc - def _generate_env_model(self, reward_func): - """ - Generate the environment model + def _generate_env_model(self, reward_func) -> tuple[VecEnv, PPO, int]: + """Generate the environment model + + Args: + reward_func (Callable): the generated reward function + + Raises: + ValueError: if algo not implemented + + Returns: + tuple[VecEnv, PPO, int]: the envs, the model, the number of envs """ numenvs = 2 # SubprocVecEnv sauf on utilisera cuda derrière diff --git a/src/PolicyTrainer/TrainingInfoCallback.py b/src/PolicyTrainer/TrainingInfoCallback.py index 59676a5..c36d0c5 100644 --- a/src/PolicyTrainer/TrainingInfoCallback.py +++ b/src/PolicyTrainer/TrainingInfoCallback.py @@ -15,13 +15,13 @@ def __init__(self): self.num_envs = None def _on_training_start(self): - """Initialisation au début de l'entraînement""" + """Init at the begin of the training""" self.num_envs = self.training_env.num_envs self.current_episode_rewards = np.zeros(self.num_envs) self.current_episode_lengths = np.zeros(self.num_envs, dtype=int) def _on_step(self) -> bool: - """Méthode appelée à chaque étape de l'entraînement.""" + """call every steps""" rewards = self.locals["rewards"] dones = self.locals["dones"] @@ -43,7 +43,7 @@ def _on_step(self) -> bool: return True def _on_training_end(self) -> None: - """Méthode appelée à la fin de l'entraînement.""" + """call at the end of the training""" rewards = self.training_metrics["episode_rewards"] rewards /= np.linalg.norm(rewards) lengths = self.training_metrics["episode_lengths"] @@ -55,8 +55,9 @@ def _on_training_end(self) -> None: } def get_metrics(self): - """ - Méthode pour récupérer les métriques calculées. 
-        :return: Dictionnaire des métriques
+        """Retrieve the computed metrics.
+
+        Returns:
+            dict: The collected metrics.
         """
         return self.custom_metrics
diff --git a/src/State/State.py b/src/State/State.py
index 7a1f314..344633d 100644
--- a/src/State/State.py
+++ b/src/State/State.py
@@ -5,65 +5,72 @@
 logger = getLogger("VIRAL")

 class State:
+    """Represents a state in the reward function generation and evaluation process.
+
+    This class encapsulates the key components of a reward function's lifecycle,
+    tracking its index, implementation, policy, and performance metrics.
+
+    Attributes:
+        idx (int): Unique identifier for the state.
+        reward_func (Callable, optional): The compiled reward function.
+        reward_func_str (str, optional): String representation of the reward function.
+        policy (object, optional): The policy associated with the reward function.
+        performances (dict, optional): Performance metrics of the reward function.
+
+    Key Characteristics:
+        - Tracks the evolution of reward functions
+        - Provides a snapshot of a specific iteration
+        - Allows for dynamic updating of policy and performance
+
+    Initialization Constraints:
+        - Initial state (idx=0) cannot have a reward function
+        - Non-initial states must have both a reward function and its string representation
+
+    Methods:
+        - set_policy(policy): Update the associated policy
+        - set_performances(performances): Update performance metrics
+        - __repr__(): Provide a human-readable string representation of the state
+
+    Example:
+        # Creating a new state for a reward function
+        state = State(
+            idx=1,
+            reward_func=my_reward_func,
+            reward_func_str="def reward_func(...):",
+            policy=None,
+            perfomances=None
+        )
+
+        # Updating state with training results
+        state.set_policy(trained_policy)
+        state.set_performances({
+            'success_rate': 0.75,
+            'average_reward': 10.5
+        })
+
+    Notes:
+        - Designed for tracking reward function iterations
+        - Provides flexibility in managing function states
+        - Supports logging and debugging of reward function generation process
+    """
     def __init__(
         self,
-        idx,
+        idx: int,
         reward_func: Callable = None,
         reward_func_str: str = None,
         policy=None,
         perfomances: dict = None,
     ):
-        """
-        Represents a state in the reward function generation and evaluation process.
-
-        This class encapsulates the key components of a reward function's lifecycle,
-        tracking its index, implementation, policy, and performance metrics.
-
-        Attributes:
-            idx (int): Unique identifier for the state.
-            reward_func (Callable, optional): The compiled reward function.
-            reward_func_str (str, optional): String representation of the reward function.
-            policy (object, optional): The policy associated with the reward function.
-            performances (dict, optional): Performance metrics of the reward function.
-
-        Key Characteristics:
-        - Tracks the evolution of reward functions
-        - Provides a snapshot of a specific iteration
-        - Allows for dynamic updating of policy and performance
-
-        Initialization Constraints:
-        - Initial state (idx=0) cannot have a reward function
-        - Non-initial states must have both a reward function and its string representation
-
-        Methods:
-        - set_policy(policy): Update the associated policy
-        - set_performances(performances): Update performance metrics
-        - __repr__(): Provide a human-readable string representation of the state
-
-        Example:
-        # Creating a new state for a reward function
-        state = State(
-            idx=1,
-            reward_func=my_reward_func,
-            reward_func_str="def reward_func(...):",
-            policy=None,
-            perfomances=None
-        )
-
-        # Updating state with training results
-        state.set_policy(trained_policy)
-        state.set_performances({
-            'success_rate': 0.75,
-            'average_reward': 10.5
-        })
-
-        Notes:
-        - Designed for tracking reward function iterations
-        - Provides flexibility in managing function states
-        - Supports logging and debugging of reward function generation process
+        """Initialize a new state.
+
+        Args:
+            idx (int): Index of the state in memory.
+            reward_func (Callable, optional): The compiled reward function. Defaults to None.
+            reward_func_str (str, optional): String representation of the reward function. Defaults to None.
+            policy (optional): The policy associated with the reward function. Defaults to None.
+            perfomances (dict, optional): Performance metrics of the reward function. Defaults to None.
         """
         self.idx = idx
-        self.src: list = [self.idx]
         if self.idx == 0 and (reward_func is not None or reward_func_str is not None):
             logger.error("the inital state don't take reward function")
         elif self.idx != 0 and (reward_func is None or reward_func_str is None):
@@ -74,14 +81,20 @@ def __init__(
         self.logger_csv = getLoggerCSV()
         self.performances = perfomances

-    def set_src(self, state):
-        self.src = state.src.copy()
-        self.src.append(self.idx)
-
     def set_policy(self, policy):
+        """Set the current policy.
+
+        Args:
+            policy: The trained policy.
+        """
         self.policy = policy

     def set_performances(self, performances: dict):
+        """Set the performance metrics after testing.
+
+        Args:
+            performances (dict): Performance metrics of the reward function.
+        """
         self.performances = performances
         self.logger_csv.to_csv(self)
diff --git a/src/VIRAL.py b/src/VIRAL.py
index 0cfbf26..d6c40c5 100644
--- a/src/VIRAL.py
+++ b/src/VIRAL.py
@@ -21,8 +21,11 @@ def __init__(
         """
         Initialize VIRAL architecture for dynamic reward function generation

         Args:
+            env_type (EnvType): Parameters of the Gym environment.
             model (str): Language model for reward generation
-            learning_method (str): Reinforcement learning method
+            hf (bool, optional): Enable human feedback.
+            training_time (int, optional): Timeout for model.learn().
+            options (dict, optional): Options for the LLM.
         """
         if options.get("seed") is None:
             options["seed"] = random.randint(0, 1000000)
@@ -51,6 +54,11 @@ def __init__(
         )

     def generate_context(self, prompt_info: dict):
+        """Generate additional context for step-back prompting.
+
+        Args:
+            prompt_info (dict): Contains the task and the observation space.
+        """
         prompt = f"{prompt_info}\nDescribe which observation can achive the goal."
         sys_prompt = (
             f"You're a physics expert, specializing in {self.env_type} motion analysis.\n"
         )
@@ -207,7 +215,16 @@ def self_refine_reward(self, idx: int) -> Callable:
         return len(self.memory) - 1

-    def human_feedback(self, prompt: str, idx: int) -> Callable:
+    def human_feedback(self, prompt: str, idx: int) -> str:
+        """Run the human feedback step.
+
+        Args:
+            prompt (str): The user prompt.
+            idx (int): Index of the state to refine.
+
+        Returns:
+            str: The modified prompt.
+        """
         self.logger.info(self.memory[idx])
         visualise = input("do you need to visualise policy ?\ny/n:")
         if visualise.lower() in ["y", "yes"]:
diff --git a/src/log/LoggerCSV.py b/src/log/LoggerCSV.py
index 26c2ffe..16ad8c6 100644
--- a/src/log/LoggerCSV.py
+++ b/src/log/LoggerCSV.py
@@ -5,21 +5,50 @@
 def getLoggerCSV():
+    """Retrieve the instance of the CSV logger.
+
+    Raises:
+        NotImplementedError: If the logger instance has not been created.
+    """
     if LoggerCSV._instance is None:
         raise NotImplementedError("need to instance the logger, before get it")
     return LoggerCSV._instance

 class LoggerCSV:
+    """Logger for recording performance metrics in a CSV file.
+
+    Attributes:
+        env_type (EnvType): The type of environment.
+        llm (str): The language model used.
+        csv_file (str): Path to the CSV file for logging.
+        logger (Logger): Instance of the logger.
+        _initialized (bool): Indicates whether the logger has been initialized.
+    """
     _instance = None  # singleton

     def __new__(cls, env_type: EnvType, llm: str):
+        """Create a new instance of LoggerCSV.
+
+        Args:
+            env_type (EnvType): The type of environment.
+            llm (str): The language model used.
+
+        Returns:
+            LoggerCSV: The singleton instance of LoggerCSV.
+        """
         if cls._instance is None:
             cls._instance = super(LoggerCSV, cls).__new__(cls)
             cls._instance._initialized = False
         return cls._instance

     def __init__(self, env_type: EnvType, llm: str):
+        """Initialize the LoggerCSV instance.
+
+        Args:
+            env_type (EnvType): The type of environment.
+            llm (str): The language model used.
+        """
         if self._initialized:
             return
         self.env_type = env_type
@@ -29,6 +58,14 @@ def __init__(self, env_type: EnvType, llm: str):
         self._initialized = True

     def to_csv(self, state):
+        """Write performance metrics to the CSV file.
+
+        Args:
+            state: The state object containing performance metrics.
+
+        Raises:
+            ValueError: If the state is not completed.
+        """
         if state.performances is None:
             self.logger.debug(f"State {state.idx} is not completed")
             raise ValueError(f"State {state.idx} is not completed")
diff --git a/src/utils/utils.py b/src/utils/utils.py
index a586e9d..1780579 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -1,7 +1,5 @@
 def unwrap_env(env):
-    """
-    Fonction récursive pour déballer les wrappers Gym jusqu'à l'environnement de base.
-    """
+    """Unwrap Gym wrappers down to the base environment; useful to retrieve private env attributes."""
     while hasattr(env, "env"):  # check if env is a wrapper
         env = unwrap_env(env.env)
     return env
\ No newline at end of file
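Usage note (not part of the patch): a minimal sketch of how the CustomRewardWrapper and unwrap_env touched above fit together. Everything in it is assumed rather than taken from the diff: the success_func and llm_reward_function call signatures (step() is truncated in the hunk, so the reward combination is not shown), the gymnasium import (suggested by the 5-tuple step API), the CartPole-v1 placeholder environment id, and the utils.utils import path.

# Hypothetical usage sketch, under the assumptions stated above.
import gymnasium as gym

from PolicyTrainer.CustomRewardWrapper import CustomRewardWrapper
from utils.utils import unwrap_env  # assumed import path for src/utils/utils.py


def success_func(observation) -> bool:
    # Placeholder success criterion; the real one depends on the task.
    return False


def llm_reward_function(observation, action) -> float:
    # Placeholder for the LLM-generated reward function.
    return 0.0


env = CustomRewardWrapper(
    gym.make("CartPole-v1"),  # placeholder environment id
    success_func=success_func,
    llm_reward_function=llm_reward_function,
)
obs, info = env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# unwrap_env (last hunk) walks through the wrapper chain to reach the base
# environment, e.g. to read attributes the wrappers do not expose.
base_env = unwrap_env(env)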