-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathmatrix.py
39 lines (31 loc) · 968 Bytes
/
matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# coding:utf-8
import jieba
import pandas as pd
import codecs
import string
import re
# Clean raw text before word segmentation.
# Both patterns are raw strings (so "\s", "\." etc. reach the regex engine
# without relying on invalid string escapes) and are compiled once at import
# time instead of being rebuilt on every call.
_ALNUM_RE = re.compile(r"[a-zA-Z0-9]")
_PUNCT_RE = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\';:“”.]+|[+——!,。??、~@#¥%……&*()]+"
)


def clearTxt(line: str):
    """Strip ASCII letters, digits, whitespace and punctuation from *line*.

    Args:
        line: Raw text to clean.

    Returns:
        The cleaned text, which may be an empty string when nothing
        survives the cleaning. ``None`` is returned when *line* is the
        empty string or is not a string at all (e.g. ``None`` or a float
        NaN), so callers can tell "no input" apart from "cleaned to nothing".
    """
    # Non-string input used to crash on .strip(); treat it like empty input.
    if not isinstance(line, str) or line == '':
        return None
    line = line.strip()
    # Remove English letters and digits.
    line = _ALNUM_RE.sub("", line)
    # Remove whitespace plus the listed Chinese and English punctuation marks.
    return _PUNCT_RE.sub("", line)
# Split text into words.
def sent2word(line):
    """Segment *line* with ``jieba.cut(..., cut_all=False)``.

    Tokens that are exactly a tab character are dropped. The remaining
    tokens are joined with single spaces, and leading/trailing whitespace
    is trimmed from the result.
    """
    tokens = jieba.cut(line, cut_all=False)
    kept = (token for token in tokens if token != '\t')
    return " ".join(kept).strip()
if __name__ == '__main__':
    # Read the articles, clean and segment every cell of the 'text' column,
    # and write one space-separated line per row to data/cut.txt.
    df = pd.read_csv('data/article.csv')
    # The context manager flushes and closes the output file even if a row
    # raises part-way through; previously the handle was never closed.
    with codecs.open('data/cut.txt', 'w', encoding='utf-8') as target:
        for text in df['text']:
            # Missing cells come back from pandas as NaN (a float), and
            # clearTxt returns None for an empty string; neither can be
            # passed on to the segmenter.
            cleaned = clearTxt(text) if isinstance(text, str) else None
            # Emit an empty line for such rows so that output lines stay
            # aligned one-to-one with the input rows.
            seg_line = sent2word(cleaned) if cleaned else ''
            # write(), not writelines(): we are emitting a single string.
            target.write(seg_line + '\n')