-
Notifications
You must be signed in to change notification settings - Fork 419
/
Copy pathwhitespace.py
226 lines (200 loc) · 5.79 KB
/
whitespace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env python
# Inspired vaguely by http://compsoc.dur.ac.uk/whitespace/
# Not a language, just an encoding
r"""
This defines an encoding, 'whitespace', so::

    >>> 'test'.encode('whitespace')
    ' \t\n \t\t \t\t\t\t\n \t\n \t  \t\n \t\t'
    >>> _.decode('whitespace')
    'test'
    >>> 'asfdasdfasdf'.decode('whitespace', 'replace')
    '??'

You can insert non-whitespace anywhere, and it doesn't matter::

    >>> 'blah \t\nx \ty\t \tz\t\t\t\n \t\n \t  \t\n \t\tblah'.decode('whitespace')
    'test'
"""
from cStringIO import StringIO
import sys
import re
import codecs
def catcher(default=None, printit=False):
    """Build a decorator that turns any exception into a fallback value.

    The decorated function is called with its arguments unchanged.  If
    the call raises an ``Exception``, a one-line message is printed and
    ``default`` is returned instead of letting the error propagate.

    default -- value returned in place of a raised exception.
    printit -- when true, additionally print every call (function name,
               positional and keyword arguments) and every value that
               is returned successfully.
    """
    def decorator(func):
        def replacement(*args, **kw):
            if printit:
                # __name__ exists on Python 2 and Python 3 functions
                # alike, unlike the Python-2-only func_name attribute.
                print('Call %s %s %s' % (func.__name__, args, kw))
            try:
                value = func(*args, **kw)
                # Kept inside the try on purpose: a failing repr() of the
                # result is swallowed the same way as a failing call.
                if printit:
                    print('Returned %r' % (value,))
                return value
            except Exception as e:
                print('Got exception in %s: %s' % (func, e))
                return default
        return replacement
    return decorator
# Two-character whitespace code for each octal digit 0-7.  Every entry
# must be exactly two characters long (digit 0 is two spaces): enc_char
# emits three codes per character and dec_triplet slices its input in
# two-character steps.
numbers = ['  ', ' \t', ' \n', '\t ', '\t\t', '\t\n', '\n ', '\n\t']
char_numbers = {}        # octal digit as a string ('0'..'7') -> code
whitespace_numbers = {}  # code -> octal digit as an int (0..7)
for i, n in enumerate(numbers):
    char_numbers[str(i)] = n
    whitespace_numbers[n] = i


def enc_char(c):
    """
    Takes a single character and returns it as a six-character
    set of whitespace.

    The character's code is written as three octal digits (zero padded)
    and each digit is replaced by its two-character entry from
    ``char_numbers``.

    Raises ValueError for a character whose code is above 255; three
    octal digits cannot represent it faithfully and ``dec_triplet``
    rejects such values as well.
    """
    code = ord(c)
    if code > 255:
        raise ValueError(
            "Character out of range for whitespace encoding: %r" % c)
    # '%03o' always yields exactly three digits here, independent of how
    # the running Python version spells the result of oct().
    return ''.join([char_numbers[digit] for digit in '%03o' % code])


def dec_triplet(c):
    """
    Takes six whitespace characters (three two-character digit codes)
    and returns the single character they encode.

    Only the first six characters of ``c`` are examined.  Raises
    ValueError when any two-character slice is not a known digit code
    (this includes input shorter than six characters) or when the
    decoded value is above 255.
    """
    try:
        code = (whitespace_numbers[c[0:2]] * 64
                + whitespace_numbers[c[2:4]] * 8
                + whitespace_numbers[c[4:6]])
    except KeyError:
        raise ValueError("Bad whitespace triplet: %r" % c)
    if code > 255:
        raise ValueError("Bad whitespace triplet: %r" % c)
    return chr(code)
def enc_stream(input, output):
    """Encode everything readable from ``input`` and write it to ``output``.

    ``input`` is read in pieces of up to 4096 characters until a read
    returns an empty (false) value.  Each character is passed through
    ``enc_char`` and the result is written to ``output`` straight away.
    """
    chunk = input.read(4096)
    while chunk:
        for char in chunk:
            output.write(enc_char(char))
        chunk = input.read(4096)
@catcher(('', 0))
def codec_encode(s, errors='strict'):
    """Codec-style encoder: return ``(encoded_text, input_length)``.

    Every character of ``s`` is converted with ``enc_char`` and the
    pieces are concatenated.  ``errors`` is accepted for signature
    compatibility but is not consulted.  Because of the ``catcher``
    decorator, any exception raised while encoding is printed and
    ``('', 0)`` is returned instead.
    """
    encoded = ''.join([enc_char(char) for char in s])
    return (encoded, len(s))
def dec_stream(input, output):
    """Decode whitespace-encoded text from ``input`` and write it to ``output``.

    ``input`` is read in pieces of up to 36 characters until a read
    returns an empty (false) value.  Characters other than space, tab
    and newline are discarded.  Every complete group of six whitespace
    characters is decoded with ``dec_triplet`` (whose ValueError for an
    invalid group propagates) and written out.  An incomplete group left
    over at end of input is dropped silently.
    """
    pending = ''
    while True:
        data = input.read(36)
        if not data:
            break
        pending += re.sub(r'[^ \t\n]', '', data)
        # ">= 6", not "> 6": a buffer holding exactly one group must be
        # flushed too, otherwise the last character of well-formed input
        # (whose length is a multiple of six) is never written.
        while len(pending) >= 6:
            output.write(dec_triplet(pending[:6]))
            pending = pending[6:]
# Matches a string made up solely of the three encoding characters
# (space, tab, newline); the empty string matches as well.
_whitespace_re = re.compile(r'^[ \t\n]*$')
# Matches any single character that is not a space, tab or newline.
_not_whitespace_re = re.compile(r'[^ \t\n]')
# Matches one whole comment line of the form "#...coding: <name>...\n",
# case-insensitively.  codec_decode applies it with .match(), so it is
# only recognised at the very start of the input.
_encoding_comment_re = re.compile(r'#[^\n]*coding:\s*[a-z][a-z0-9_-]+[^\n]*\n', re.I)
#@catcher((0, ''))
def codec_decode(s, errors='strict', extra=None):
    """Codec-style decoder: return ``(decoded_text, consumed_count)``.

    s      -- the whitespace-encoded text.
    errors -- 'strict', 'ignore' or 'replace' (see below).
    extra  -- only set when this function is used as a bound method
              (``StreamReader.decode``): the instance then arrives in
              ``s``, the text in ``errors`` and the error mode here.

    A leading "# ... coding: name" comment line is skipped.  Then:

    * 'strict'  -- if the text is not pure whitespace, every other
                   character is dropped before decoding; an invalid
                   six-character group raises ValueError.
    * 'ignore'  -- other characters are dropped; invalid groups are
                   skipped.
    * 'replace' -- nothing is dropped; each invalid six-character slice
                   yields '?'.

    Any other ``errors`` value drops nothing and re-raises the
    ValueError of an invalid group.  Characters after the last complete
    six-character group are neither decoded nor counted as consumed.
    """
    if extra is not None:
        # Bound-method call: shift the arguments back into place.
        # NOTE(review): the error mode passed by the caller is discarded
        # here and 'strict' is always used, exactly as before -- confirm
        # this is intended.
        s = errors
        errors = 'strict'
    extra_used = 0
    match = _encoding_comment_re.match(s)
    if match:
        s = s[match.end():]
        extra_used += match.end()
    if (errors == 'ignore'
            or (errors == 'strict' and not _whitespace_re.match(s))):
        complete = []      # finished six-character groups
        group = ''         # whitespace collected towards the next group
        skipped = 0        # junk characters up to the last finished group
        group_skipped = 0  # junk characters since the last finished group
        for c in s:
            if c in ' \t\n':
                group += c
            else:
                group_skipped += 1
            if len(group) == 6:
                complete.append(group)
                skipped += group_skipped
                group, group_skipped = '', 0
        # Add to, rather than overwrite, the offset of a stripped coding
        # comment so the consumed count covers both.
        extra_used += skipped
        s = ''.join(complete)
    length = len(s)
    usable = length - (length % 6)
    result = []
    for start in range(0, usable, 6):
        try:
            result.append(dec_triplet(s[start:start + 6]))
        except ValueError:
            if errors == 'replace':
                result.append('?')
            elif errors != 'ignore':
                raise
    used = usable + extra_used
    return (''.join(result), used)
### Codec APIs
class StreamWriter(codecs.StreamWriter):
    """Stream writer for the 'whitespace' encoding."""

    # codec_encode takes (s, errors) only.  Stored as a plain function it
    # would be bound as a method and receive the writer instance as an
    # extra first argument; the resulting TypeError is swallowed by the
    # catcher decorator and ('', 0) comes back, so nothing would ever be
    # written.  staticmethod keeps the instance out of the call.
    encode = staticmethod(codec_encode)
class StreamReader(codecs.StreamReader):
    """Stream reader for the 'whitespace' encoding.

    ``decode`` is ``codec_decode`` stored as a plain function, so a call
    on an instance passes the instance as the first argument;
    ``codec_decode`` detects that three-argument form through its
    ``extra`` parameter.
    """
    decode = codec_decode
def find_codec(codec_name):
    """Codec search function for ``codecs.register``.

    Returns the ``(encoder, decoder, stream reader, stream writer)``
    tuple when ``codec_name`` equals 'whitespace' in any letter case,
    and None for every other name.
    """
    if codec_name.lower() != 'whitespace':
        return None
    return (codec_encode, codec_decode, StreamReader, StreamWriter)
# Runs at import time: registers find_codec as a codec search function so
# that the name 'whitespace' can be resolved.
codecs.register(find_codec)
### Command-line usage
def main(args=None):
    """Command-line entry point: encode or decode stdin to stdout.

    args -- list of command-line arguments; defaults to ``sys.argv[1:]``.

    Options: ``-e``/``--encode``, ``-d``/``--decode``, ``-r``/``--repr``
    (collect the output and print its ``repr()`` instead of writing it
    raw) and ``--doctest`` (run the module doctests and exit).  Any
    positional argument, or giving neither -e nor -d, prints the help
    text.  Returns None.
    """
    if args is None:
        args = sys.argv[1:]
    import optparse
    parser = optparse.OptionParser(usage="%prog [OPTIONS]")
    parser.add_option(
        '-r', '--repr',
        action="store_true",
        dest="repr",
        help="Use repr() on output")
    parser.add_option(
        '-e', '--encode',
        action="store_true",
        dest="encode",
        help="Encode input")
    parser.add_option(
        '-d', '--decode',
        action="store_true",
        dest="decode",
        help="Decode input")
    parser.add_option(
        '--doctest',
        action="store_true",
        dest="doctest",
        help="Run doctests")
    # Parse the list that was handed in; calling parse_args() without it
    # would silently fall back to sys.argv and ignore the ``args``
    # parameter.
    options, args = parser.parse_args(args)
    if args:
        parser.print_help()
        return
    if options.doctest:
        import doctest
        doctest.testmod()
        return
    if options.encode:
        streamer = enc_stream
    elif options.decode:
        streamer = dec_stream
    else:
        print('You must give -e or -d')
        parser.print_help()
        return
    if options.repr:
        # Buffer the output so it can be shown as a repr() afterwards.
        output = StringIO()
    else:
        output = sys.stdout
    streamer(sys.stdin, output)
    if options.repr:
        print(repr(output.getvalue()))
# Run the command-line interface only when executed as a script, not
# when the module is imported to register the codec.
if __name__ == '__main__':
    main()