forked from bluegreengreen/7deadlysongs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path7deadlysongs.py
335 lines (250 loc) · 10.3 KB
/
7deadlysongs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
import math
import random
import spotipy
from spotipy.oauth2 import SpotifyOAuth
k = 7
passes = 100
# Attributes with associated weights - the higher, the more important (I think) they are
attributes = {"explicit": {"numerical": False, "weight": 0.5},
"year": {"numerical": True, "weight": 3},
"timeSig": {"numerical": False, "weight": 1},
"acoustic": {"numerical": True, "weight": 1},
"speech": {"numerical": True, "weight": 1.2},
"dance": {"numerical": True, "weight": 1.2},
"tempo": {"numerical": True, "weight": 1.4},
"instrumental": {"numerical": True, "weight": 1.7},
"energy": {"numerical": True, "weight": 1.7},
"valence": {"numerical": True, "weight": 2}
}
# Replace these empty strings with the appropriate values for your app!
clientID = ""
clientSecret = ""
redirectURI = "" # you can just use http://localhost/ if you need to
# Classes for holding relevant track data, and clusterings
class TrackData:
def __init__(self, dataPoint):
self.values = {
"artist": dataPoint.values["artist"],
"explicit": dataPoint.values["explicit"],
"year": dataPoint.values["year"],
"acoustic": dataPoint.values["acoustic"],
"dance": dataPoint.values["dance"],
"energy": dataPoint.values["energy"],
"instrumental": dataPoint.values["instrumental"],
"speech": dataPoint.values["speech"],
"tempo": dataPoint.values["tempo"],
"timeSig": dataPoint.values["timeSig"],
"valence": dataPoint.values["valence"]
}
class DataPoint:
def __init__(self, trackID, title, artist, explicit, year):
self.trackID = trackID
self.title = title
self.values = {
"artist": artist,
"explicit": explicit,
"year": int(year.split("-")[0])
}
def addFeatures(self, acoustic, dance, energy, instrumental, speech, tempo, timeSig, valence):
self.values.update([("acoustic", acoustic),
("dance", dance),
("energy", energy),
("instrumental", instrumental),
("speech", speech),
("tempo", tempo),
("timeSig", timeSig),
("valence", valence)])
class Cluster:
def __init__(self, centroid):
self.centroid = centroid
self.elements = []
def insert(self, element):
self.elements.append(element)
def orderByProximity(self):
tempPairs = []
for track in self.elements:
tempPairs.append((dissimilarity(track, self.centroid), track))
tempPairs.sort(key=lambda x: x[0])
self.elements = []
for pair in tempPairs:
self.elements.append(pair[1])
class Clustering:
def __init__(self):
self.clusters = []
def getInitialCentroids(self, pointList):
random.shuffle(pointList)
for i in range(0, k):
currentPoint = pointList.pop()
newCluster = Cluster(TrackData(currentPoint))
newCluster.insert(currentPoint)
self.clusters.append(newCluster)
# Normalises numerical attributes such that they have mean 0 and standard deviation 1
def normalise(points):
for attr in attributes.keys():
if attributes[attr]["numerical"]:
m, sd = getMeanSD(attr, points)
rescale(m, sd, attr, points)
def getMeanSD(attr, points):
mean = attributeMean(attr, points)
total = 0
for point in points:
total += (point.values[attr] - mean) ** 2
sd = math.sqrt(total / len(points))
return mean, sd
def rescale(mean, sd, attr, points):
for point in points:
point.values[attr] = (point.values[attr] - mean) / sd
def weightedDistance(point, centroid, attr, isNumerical, weight):
if isNumerical:
return
else:
return distCategorical(point.values[attr], centroid.values[attr]) * weight
# Calculates mixed Euclidean distance between a point and a centroid.
def dissimilarity(point, centroid):
sim = 0
for attr in attributes.keys():
if attributes[attr]["numerical"]:
sim += distNumerical(point.values[attr], centroid.values[attr]) * attributes[attr]["weight"]
else:
sim += distCategorical(point.values[attr], centroid.values[attr]) * attributes[attr]["weight"]
return math.sqrt(sim)
def distNumerical(v1, v2):
return (v1 - v2) ** 2
def distCategorical(v1, v2):
if v1 != v2:
return 1
else:
return 0
def attributeMean(attr, points):
total = 0
for point in points:
total += point.values[attr]
return total / len(points)
# Used for determining a centroid value for non-numerical attributes.
def attributeMajority(attr, points):
scores = {}
for point in points:
entry = point.values[attr]
if entry in scores:
scores[entry] += 1
else:
scores[entry] = 1
return max(scores, key=scores.get)
# Returns the index of the cluster whose centroid the track is most similar to.
def getCluster(point, clusters):
minSim = 9999 # arbitrarily high
minIndex = -1
for index, cluster in enumerate(clusters):
sim = dissimilarity(point, cluster.centroid)
if sim < minSim:
minSim = sim
minIndex = index
return minIndex
def adjustCentroid(cluster):
vals = cluster.centroid.values
points = cluster.elements
for attr in attributes.keys():
if attributes[attr]["numerical"]:
vals[attr] = attributeMean(attr, points)
else:
vals[attr] = attributeMajority(attr, points)
def assignment(clustering):
clusters = clustering.clusters
for cluster in clusters:
points = cluster.elements
for i in range(0, len(points)):
point = points.pop()
clusters[getCluster(point, clusters)].insert(point)
def refitting(clustering):
for cluster in clustering.clusters:
adjustCentroid(cluster)
def kMeans(clustering, iterations):
for i in range(0, iterations):
assignment(clustering)
refitting(clustering)
# Picks the most central element from each cluster, (ideally) with a distinct artist.
def pickBest(clustering):
reps = []
artists = []
found = False
for cluster in clustering.clusters:
cluster.orderByProximity()
for track in cluster.elements:
if track.values["artist"] not in artists:
reps.append(track.trackID)
artists.append(track.values["artist"])
found = True
break
if not found: # couldn't find a distinct artist
track = cluster.elements[0]
reps.append(track.trackID)
artists.append(track.values["artist"])
found = False
return reps
# Prints every element of each cluster
def printClustering(clustering):
print("YOUR CLUSTERS:\n")
for count, item in enumerate(clustering.clusters):
print("CLUSTER", count)
for track in item.elements:
print(track.values["artist"] + ": " + track.title)
print("")
def main():
IDs = []
initialPoints = []
print("Connecting to Spotify...")
scope = "playlist-read-private playlist-modify-private"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope,
client_id=clientID,
client_secret=clientSecret,
redirect_uri=redirectURI))
playlist = input("Paste the URL of your playlist: ")
try:
results = sp.playlist_items(playlist)
except spotipy.exceptions.SpotifyException:
print("ERROR: Couldn't retrieve playlist. Make sure the URL is correct!")
return
# combine all results
allItems = results["items"]
while results["next"]:
results = sp.next(results)
allItems.extend(results["items"])
for item in allItems:
track = item["track"]
IDs.append(track["id"])
initialPoints.append(DataPoint(track["id"], track["name"], track["artists"][0]["name"], track["explicit"],
track["album"]["release_date"]))
# audio features can only be acquired in batches of 100
audioFeatures = []
start = 0
end = 100
while start < len(IDs):
IDsSlice = IDs[start:end]
start += 100
end += 100
audioFeatures = audioFeatures + sp.audio_features(IDsSlice)
for count, item in enumerate(audioFeatures):
initialPoints[count].addFeatures(item["acousticness"], item["danceability"], item["energy"],
item["instrumentalness"], item["speechiness"], item["tempo"],
str(item["time_signature"]), item["valence"])
print("Processing your data...")
normalise(initialPoints)
# Generating the initial configuration
clustering = Clustering()
clustering.getInitialCentroids(initialPoints)
clusters = clustering.clusters
for i in range(0, len(initialPoints)):
point = initialPoints.pop()
clusters[getCluster(point, clusters)].insert(point)
refitting(clustering)
# Main processing
kMeans(clustering, passes)
topPicks = pickBest(clustering)
newList = sp.user_playlist_create(sp.current_user()["id"], "7 deadly songs!", False, False,
"A nice little selection of 7 songs to share with your friends.")
sp.playlist_add_items(newList["id"], topPicks)
print("Done! You can check out your 7 deadly songs here: " + newList["external_urls"]["spotify"])
# Un-comment the line below if you want to see all of the tracks in each clustering
# printClustering(clustering)
if __name__ == "__main__":
main()