Skip to content

Commit

Permalink
u
Browse files Browse the repository at this point in the history
  • Loading branch information
Jiayi-Pan committed Jan 23, 2025
1 parent 37670e9 commit 54621a0
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions verl/utils/dataset/rl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,11 @@ def _read_files_and_tokenize(self):
# filter out too long prompts
tokenizer = self.tokenizer
prompt_key = self.prompt_key
self.dataframe = self.dataframe[self.dataframe.apply(lambda doc: len(
tokenizer.apply_chat_template(doc[prompt_key], add_generation_prompt=True)) <= self.max_prompt_length,
axis=1)]

# never mind if a prompt is too long: the length filter below is intentionally disabled
# self.dataframe = self.dataframe[self.dataframe.apply(lambda doc: len(
# tokenizer.apply_chat_template(doc[prompt_key], add_generation_prompt=True)) <= self.max_prompt_length,
# axis=1)]

print(f'filter dataset len: {len(self.dataframe)}')

Expand Down

0 comments on commit 54621a0

Please sign in to comment.