read.py
import os
import pandas as pd
# Specify the folder path
folder_path = "your_folder_path"  # the directory holding the segmented-policy word-frequency .txt files (ending in _分词后_词频.txt); it should sit at the same directory level as this read.py
# List that collects the extracted rows
data = []
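# Illustrative sketch of the assumed layout (file names and values below are hypothetical, not from the original):
#   read.py
#   your_folder_path/
#       XXXX_分词后_词频.txt    <- each relevant line is assumed to be "word count", e.g. "创新 37"
#       YYYY_分词后_词频.txt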
# Walk the folder and its subfolders for the word-frequency txt files
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        if file_name.endswith(".txt") and "_分词后_词频.txt" in file_name:
            print("Processing " + file_name + ".........")
            file_path = os.path.join(root, file_name)
            # Extract the "*******" part of the file name (the part before the first "_")
            file_id = file_name.split("_")[0]
            # Read the first 5 lines of the file
            # (encoding assumed to be UTF-8; adjust if the txt files use a different encoding)
            with open(file_path, 'r', encoding="utf-8") as file:
                lines = [line.strip().split() for line in file.readlines()[:5]]
            # Extract the data and append it to the list
            for line in lines:
                if len(line) == 2:
                    data.append([line[0], int(line[1]), file_id])
# Build the DataFrame (columns: 高频词 = high-frequency word, 次数 = count, 出处 = source file id)
df = pd.DataFrame(data, columns=["高频词", "次数", "出处"])
# Print the DataFrame
print(df)
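# At this point each row of df is [word, count, file_id]: for a file named "XXXX_分词后_词频.txt"
# the 出处 column holds "XXXX", i.e. the part of the file name before the first "_".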
# The block below is not used for now
# # Create a dict to hold the result of compressing rows by the third column (出处)
# compressed_data = {}
# # Iterate over the rows and compress the high-frequency words and counts of the same source into one cell
# for idx, row in df.iterrows():
#     key = row["出处"]
#     if key in compressed_data:
#         compressed_data[key]["高频词"].append(row["高频词"])
#         compressed_data[key]["次数"].append(row["次数"])
#     else:
#         compressed_data[key] = {"高频词": [row["高频词"]], "次数": [row["次数"]]}
# # Rebuild the DataFrame and write the compressed result into it
# compressed_df = pd.DataFrame(columns=["高频词", "次数", "出处"])
# for key, value in compressed_data.items():
#     compressed_df = pd.concat([compressed_df, pd.DataFrame({"高频词": [", ".join(value["高频词"])], "次数": [sum(value["次数"])], "出处": [key]})], ignore_index=True)
# # Print the compressed DataFrame
# print(compressed_df)
# output_file_path2 = "output2.xlsx"
# compressed_df.to_excel(output_file_path2)
# Save as an xlsx file
output_file_path = "output.xlsx"
df.to_excel(output_file_path)
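# Note (a suggestion, not part of the original script): DataFrame.to_excel writes the index as an
# extra first column by default; pass index=False, e.g. df.to_excel(output_file_path, index=False),
# if only the three data columns are wanted in the output.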