-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpunctuation.py
51 lines (40 loc) · 1.25 KB
/
punctuation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
# Load the whole corpus into memory.  The encoding is pinned explicitly so
# decoding does not depend on the platform's locale default; the patterns
# below contain the Arabic comma, so the text is clearly non-ASCII.
# NOTE(review): assumes Corpus.txt is UTF-8 encoded -- confirm.
with open('Corpus.txt', 'r', encoding='utf-8') as content_file:
    content = content_file.read()
# Ordered (regex, replacement) pairs that normalise spacing around punctuation.
# Order matters: each substitution runs on the output of the previous one, so
# the early rules pad brackets with spaces and the later rules trim the excess.
replacement_patterns = [
    # Pad round brackets on the outside: "()" => " () "
    (r"\)", ") "),
    (r"\(", " ("),
    # Pad square brackets on the outside: "[]" => " [] "
    (r"\]", '] '),
    (r"\[", ' ['),
    (r"[ ]{2,}", ' '),  # collapse runs of two or more spaces into one
    # Collapse any run of consecutive newlines into a single newline
    # (a plain "\n\n" rule would only halve longer runs).
    (r"\n{2,}", '\n'),
    # " ، " or " ،" => ", " (the Arabic comma is rewritten as an ASCII comma)
    (r" ، ", ', '),
    (r" ،", ', '),
    (r"\s\.\s", '. '),  # "." with whitespace on both sides => ". "
    (r"\s\.(?!\.) ", '. '),  # a single "." (not ".." or "...") followed by a space
    (r"\s\.\.\.", '... '),  # drop whitespace before "..." and add a space after it
    # Trim whitespace just inside round brackets: "( )" => "()"
    (r"\s\)", ')'),
    (r"\(\s", '('),
    # Trim whitespace just inside square brackets: "[ ]" => "[]"
    (r"\s\]", ']'),
    (r"\[\s", '['),
    (r"\s:", ':'),  # drop whitespace before ":"
    (r":(?!\s)", ': '),  # add a space after ":" when none follows
]
class RegexpReplacer(object):
    """Apply an ordered list of regex substitutions to a piece of text.

    Each rule is a ``(regex, replacement)`` pair.  The regexes are compiled
    once at construction time and applied in order by :meth:`replace`.
    """

    def __init__(self, patterns=None):
        """Compile the substitution rules.

        Args:
            patterns: iterable of ``(regex, replacement)`` pairs.  When
                omitted (or ``None``), the module-level
                ``replacement_patterns`` list is used.
        """
        if patterns is None:
            # Looked up at call time instead of being bound as a default
            # argument, so the shared module-level list is not captured
            # when the class is defined.
            patterns = replacement_patterns
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return *text* after applying every substitution rule in order."""
        result = text
        for pattern, repl in self.patterns:
            # Each rule operates on the output of the previous one.
            result = pattern.sub(repl, result)
        return result
# Normalise the corpus with the default rules and write the result out.
# The output encoding is pinned to UTF-8 so non-ASCII text is written the same
# way on every platform instead of using the locale's default encoding.
rep = RegexpReplacer()
content = rep.replace(content)
with open('new.txt', 'w', encoding='utf-8') as the_file:
    the_file.write(content)