Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
cenkai88 committed Sep 24, 2015
1 parent de73b0f commit 96f1f74
Showing 1 changed file with 21 additions and 27 deletions.
48 changes: 21 additions & 27 deletions cernet/stat.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,15 @@ def cleanse(record, ipdict, parser, air):
if not k:
keyword = 'unknown'
t = 'unknown'

return '%s|%s|%s|%s|%s|%s|%s'%(month, day, hour, school, host, t, keyword)

def stat(x, prefix, features, f):
    """Count records grouped by one or more features and write the result to a CSV.

    Args:
        x: RDD of '|'-separated record strings (presumably the output of
            ``cleanse`` -- confirm against the caller).
        prefix: Label placed in square brackets at the start of the output
            file name, so results for different subsets do not overwrite
            each other.
        features: List of feature names to group by.
        f: Mapping from feature name to the index of that field in the
            split record.
    """
    if len(features) == 1:
        # Single feature: count occurrences of that one field directly.
        fea = f[features[0]]
        result = x.map(lambda record: (record.split('|')[fea], 1)).reduceByKey(add).collect()
    else:
        # Several features: the grouping is delegated to stat1.
        result = stat1(x, features, f).collect()
    # Both placeholders need a value; the earlier one-argument form raised
    # "TypeError: not enough arguments for format string".
    filename = 'result/[%s]%s.csv' % (prefix, '_'.join(features))
    output(result, filename, len(features))

def output(result, filename, n):
Expand Down Expand Up @@ -109,10 +108,6 @@ def main(sc, path):
for line in ipdicts.readlines():
ipdict[line.split('|')[0]] = line.split('|')[1].strip().decode('utf-8', 'ignore')
cernet = sc.textFile(path)
cernet2 = cernet.map(lambda x : cleanse(x, ipdict, parser, air)).filter(lambda x : x!=None).cache()
cernet_trip = cernet.filter(lambda x: x.split('|')[5]==u'旅游').cache()
cernet_shop = cernet.filter(lambda x: x.split('|')[5]==u'购物').cache()
cernet_search = cernet.filter(lambda x: x.split('|')[5]==u'搜索').cache()
parser = [{'regex': '^tieba\.baidu\.com\/+[\w\W]+kw=', 'type': u'搜索', 'dest' : lambda x: urllib.unquote(x.split('kw=')[1].split('&')[0]).encode('raw_unicode_escape').decode('utf-8', 'ignore')},\
{'regex': '^m\.tieba\.com\/+[\w\W]+word=', 'type': u'搜索', 'dest' : lambda x: urllib.unquote(x.split('word=')[1].split('&')[0]).encode('raw_unicode_escape').decode('utf-8', 'ignore')}, \
{'regex': '^(www\.b|b)aidu\.com\/+[\w\W]+wd=', 'type': u'搜索', 'dest' : lambda x: urllib.unquote(x.split('wd=')[1].split('&')[0]).encode('raw_unicode_escape').decode('utf-8', 'ignore')}, \
Expand Down Expand Up @@ -175,29 +170,28 @@ def main(sc, path):

for i in xrange(len(parser)):
parser[i]['regex'] = re.compile(parser[i]['regex'])
# Single-dimension statistics
cernet2 = cernet.map(lambda x : cleanse(x, ipdict, parser, air)).filter(lambda x : x!=None).cache()
cernet_trip = cernet2.filter(lambda x: x.split('|')[5]==u'旅游').cache()
cernet_shop = cernet2.filter(lambda x: x.split('|')[5]==u'购物').cache()
cernet_search = cernet2.filter(lambda x: x.split('|')[5]==u'搜索').cache()
for dim in f:
stat(cernet2, [dim], f)

# Two-dimension statistics
stat(cernet2, ['month', 'hour'], f)
stat(cernet2, ['day', 'hour'], f)
stat(cernet_trip, ['keyword', 'month'], f)
stat(cernet_trip, ['keyword', 'school'], f)
stat(cernet_shop, ['keyword', 'hour'], f)
stat(cernet_shop, ['keyword', 'month'], f)
stat(cernet_shop, ['host', 'hour'], f)
stat(cernet_shop, ['host', 'month'], f)
stat(cernet_shop, ['host', 'day'], f)
stat(cernet_shop, ['host', 'school'], f)

# Keyword statistics per category
stat(cernet2, 'total', [dim], f)
stat(cernet2, 'total', ['month', 'hour'], f)
stat(cernet2, 'total', ['day', 'hour'], f)
stat(cernet_trip, 'trip', ['keyword', 'month'], f)
stat(cernet_trip, 'trip', ['keyword', 'school'], f)
stat(cernet_shop, 'shop', ['keyword', 'hour'], f)
stat(cernet_shop, 'shop', ['keyword', 'month'], f)
stat(cernet_shop, 'shop', ['host', 'hour'], f)
stat(cernet_shop, 'shop', ['host', 'month'], f)
stat(cernet_shop, 'shop', ['host', 'day'], f)
stat(cernet_shop, 'shop', ['host', 'school'], f)
for dim in f:
if dim!='type':
stat(cernet_shop, [dim], f)
stat(cernet_trip, [dim], f)
stat(cernet_search, [dim], f)
stat(cernet_shop, 'shop', [dim], f)
stat(cernet_trip, 'trip', [dim], f)
stat(cernet_search, 'search', [dim], f)

if __name__ == "__main__":
path = sys.argv[1]
conf = SparkConf().setAppName(APP_NAME)
Expand Down

0 comments on commit 96f1f74

Please sign in to comment.