lyrics.py
# Scrape Kanye West lyrics from LyricWikia and store each song as a document in MongoDB.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pymongo import MongoClient
import requests
import time
import re
import lxml.html

base_url = "http://lyrics.wikia.com"
yeezys_page = "http://lyrics.wikia.com/wiki/Kanye_West"
test_url = "http://lyrics.wikia.com/wiki/Kanye_West:Big_Brother"

client = MongoClient()
db = client.kanye


def getTitle(url):
    """Return the song title from a song page, lowercased with underscores."""
    response = requests.get(url)
    soupeezy = BeautifulSoup(response.text, "html.parser")
    meta_title = soupeezy.title.string
    # Page titles look like "Kanye West:Big Brother Lyrics - LyricWikia - Wikia";
    # strip the artist prefix and the site suffix.
    title = re.sub(r".*\:", "", meta_title)
    title = re.sub(r" Lyrics - LyricWikia - Wikia", "", title)
    title = re.sub(" ", "_", title)
    return title.lower()


def getAlbum(url):
    """Return the album name from a song page, lowercased with underscores."""
    response = requests.get(url)
    soupeezy = BeautifulSoup(response.text, "html.parser")
    # The album link is rendered as <i><a href="..." title="Album (Year)">Album (Year)</a></i>;
    # strip the surrounding markup and the trailing "(YYYY)" year.
    album_html = str(soupeezy.select('i > a'))
    album_html = re.sub(r"(\w|\s|\W)*\"\s(title)\=\"(\w|\s|\W)*\"\>", "", album_html)
    album = re.sub(r"\s\((\d){4}\)\<\/a\>\]", "", album_html)
    album = re.sub(" ", "_", album)
    return album.lower()


def getLyrics(url):
    """Extract the lyrics from a song page, insert a document into MongoDB, and return the text."""
    doc = lxml.html.parse(url)
    kanyes = doc.getroot().cssselect(".lyricbox")[0]
    lyrics = []
    if kanyes.text is not None:
        lyrics.append(kanyes.text)
    # The lyric box separates lines with <br> tags; collect the text that follows each one.
    for kanye in kanyes:
        if str(kanye.tag).lower() == "br":
            lyrics.append("\n")
        if kanye.tail is not None:
            lyrics.append(kanye.tail)
    full = "".join(lyrics).strip()
    title = getTitle(url)
    db.kanye.insert_one({
        "title": title,
        "album": getAlbum(url),
        "lyrics": full
    })
    print(title)
    return full


def getAllLyrics():
    """Scrape every song linked from the artist page and return the collected lyrics."""
    lyrics = []
    links = []
    response = requests.get(yeezys_page)
    soupeezy = BeautifulSoup(response.text, "html.parser")
    # Song links on the artist page sit inside ordered lists: <ol><li><b><a href="...">.
    for yeezy in soupeezy.select('ol > li > b > a'):
        links.append(urljoin(base_url, yeezy['href']))
    for link in links:
        lyrics.append(getLyrics(link))
        time.sleep(1)  # be polite: pause between requests
    return lyrics
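

# Illustrative entry point (not in the original script): a minimal sketch of how the scraper
# might be run, assuming a MongoDB server is reachable on localhost and the lyrics.wikia.com
# pages above are still served. Adjust these assumptions to your own setup.
if __name__ == "__main__":
    # Scrape a single known page first to verify the selectors, then crawl the artist page.
    print(getLyrics(test_url))
    all_lyrics = getAllLyrics()
    print("scraped %d songs" % len(all_lyrics))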