-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathconvert_parquet.py
49 lines (40 loc) · 1.38 KB
/
convert_parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import json
import pandas as pd
def reformat_csv_to_openassitant(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reformat the downloaded CSV into the Open Assistant instruction format
    so that it can be directly ingested into the training pipeline.

    (Name spelling "openassitant" is kept as-is for caller compatibility.)

    Parameters
    ----------
    df: the downloaded pandas dataframe; expected to contain the columns
        "question_title", "content", "question_id", "answer_id",
        "author_id", "upvotes", "answer_creation_time"

    Return
    ------
    DataFrame: reformatted dataframe with INSTRUCTION / RESPONSE / SOURCE /
        METADATA columns, with blank-response rows removed
    """
    new_df = pd.DataFrame()
    new_df["INSTRUCTION"] = df["question_title"]
    new_df["RESPONSE"] = df["content"]
    new_df["SOURCE"] = "Zhihu"
    # Keep per-answer provenance as a JSON string; ensure_ascii=False
    # preserves the original (non-ASCII) characters verbatim.
    new_df["METADATA"] = df.apply(
        lambda x: json.dumps(
            {
                "question_id": x["question_id"],
                "answer_id": x["answer_id"],
                "author_id": x["author_id"],
                "upvotes": x["upvotes"],
                "answer_creation_time": x["answer_creation_time"],
            },
            ensure_ascii=False,
        ),
        axis=1,
    )
    # Remove empty response rows.
    # BUGFIX: the previous expression `~(resp == " ") | resp.isna()` applied
    # the negation only to the first comparison, so NaN rows were KEPT.
    # Treat NaN and whitespace-only strings (including "" and " ") as blank.
    blank = new_df["RESPONSE"].fillna("").astype(str).str.strip() == ""
    new_df = new_df[~blank]
    return new_df
if __name__ == "__main__":
    # Source CSV downloaded from Zhihu.
    source_csv = "zhihu.csv"
    # Load the raw dataset (any loader producing a DataFrame would do).
    raw_frame = pd.read_csv(source_csv)
    formatted = reformat_csv_to_openassitant(raw_frame)
    # Persist in Parquet so the training pipeline can ingest it directly.
    formatted.to_parquet(
        "dataset.parquet",
        row_group_size=100,
        engine="pyarrow",
        index=False,
    )