-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathtournament_winners_vs_ufe.py
96 lines (78 loc) · 3.62 KB
/
tournament_winners_vs_ufe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
import altair as alt
import pandas as pd
## path to a local checkout of the repo with the relevant data
## ( https://github.com/JeffSackmann/tennis_slam_pointbypoint )
data_prefix = '../tennis_slam_pointbypoint/'
year, tourney = '2021', 'wimbledon'

## load tournament data (the "<year>-<tourney>-matches.csv" and
## "<year>-<tourney>-points.csv" files under data_prefix):
matches = pd.read_csv(data_prefix + year + '-' + tourney + '-matches.csv')
points = pd.read_csv(data_prefix + year + '-' + tourney + '-points.csv')

## attach the match columns to every point row; with no `on=` argument
## pd.merge joins on the columns the two frames have in common.
## Missing values are replaced by 0 so the casts below cannot hit NaN.
mpoints = pd.merge(points, matches).fillna(0)

## DataFrame.astype returns a NEW frame and leaves the original untouched,
## so the result has to be assigned back for the conversion to take effect
mpoints = mpoints.astype({'PointWinner': 'int',
                          'P1Winner': 'int',
                          'P2Winner': 'int',
                          'P1UnfErr': 'int',
                          'P2UnfErr': 'int'
                          })

## skip points without a winner (placeholders when matches are starting, etc)
mpoints = mpoints.loc[mpoints['PointWinner'] != 0]
## get list of (women's) players who reached R3:
## keep the matches whose match_id has '2' as its 4th-from-last character
## and '3' as its 3rd-from-last character, then collect both players of each
match_ids = matches['match_id']
is_womens = match_ids.str[-4] == '2'
is_third_round = match_ids.str[-3] == '3'
womens_third = matches.loc[is_womens & is_third_round]

r3_players = womens_third['player1'].tolist()
r3_players = r3_players + womens_third['player2'].tolist()
## get winner rate and unforced error rate for each player
## (each rate is a count divided by ALL points in the player's matches)
player_rates = []
for player in r3_players:
    ## every point of every match this player took part in, on either side
    ppoints = mpoints.loc[(mpoints['player2'] == player) | (mpoints['player1'] == player)]
    n_points = len(ppoints)
    if n_points == 0:
        ## no point data for this player: the rates would divide by zero
        ## and there is no last point to inspect, so leave her out
        continue

    ## which side of the match record the player is on, per point
    as_p1 = ppoints['player1'] == player
    as_p2 = ppoints['player2'] == player

    ## count only the winners / unforced errors credited to this player's side
    n_winners = int(((ppoints['P1Winner'] == 1) & as_p1).sum()
                    + ((ppoints['P2Winner'] == 1) & as_p2).sum())
    n_errors = int(((ppoints['P1UnfErr'] == 1) & as_p1).sum()
                   + ((ppoints['P2UnfErr'] == 1) & as_p2).sum())
    winner_rate = n_winners / float(n_points)
    error_rate = n_errors / float(n_points)

    ## is player still in the tournament? (did she win her last point?)
    last_point = ppoints.iloc[-1]
    won_last_point = (
        (last_point['player1'] == player and last_point['PointWinner'] == 1)
        or (last_point['player2'] == player and last_point['PointWinner'] == 2)
    )
    ## 1 = won her last recorded point, 0.5 = did not
    still_in = 1 if won_last_point else 0.5

    player_rates.append([player, winner_rate, error_rate, still_in])

rates_header = ['Player', 'Winner Perc', 'Error Perc', 'still_in']
df = pd.DataFrame(player_rates, columns=rates_header)
## label only players remaining in the tournament (with last name only);
## a row counts as "remaining" when its still_in value equals 1
def _last_name_if_still_in(row):
    """Return the last space-separated token of row.Player, or '' unless row.still_in == 1."""
    if row.still_in != 1:
        return ''
    return row.Player.split(' ')[-1]


df['label'] = df.apply(_last_name_if_still_in, axis=1)
## scatter plot of the per-player rates: unforced errors on x, winners on y,
## both axes formatted as percentages over a fixed domain
error_axis = alt.X(
    'Error Perc:Q',
    scale=alt.Scale(domain=(0.1, 0.24)),
    axis=alt.Axis(format='%', title='Unforced Error Rate (ufe per point)'),
)
winner_axis = alt.Y(
    'Winner Perc:Q',
    scale=alt.Scale(domain=(0.05, 0.24)),
    axis=alt.Axis(format='%', title='Winner Rate (winners per point)'),
)
## 'N' treats still_in as nominal, not a quantitative scale, so just picks
## two different colors
status_color = alt.Color('still_in:N', legend=None)

points = alt.Chart(df).mark_circle(size=60).encode(
    x=error_axis,
    y=winner_axis,
    color=status_color,
)
## name labels: reuse the scatter chart's encodings, drawn as text placed
## just to the right of each point
label_marks = points.mark_text(align='left', baseline='middle', dx=7)
text = label_marks.encode(text='label')

## regression over the two rate columns (first argument 'Error Perc',
## second 'Winner Perc'); strokeDash gives dotted line
regression = points.transform_regression('Error Perc', 'Winner Perc')
reg_line = regression.mark_line(strokeDash=[1, 1])
## layer the scatter, the labels and the regression line into one chart
layered = points + text + reg_line

## the 'category' list specifies colors for the two options of 'still_in'
## (a named scheme such as {'scheme': 'dark2'} was the earlier alternative)
still_in_colors = ['#00008B', '#B0E0E6']
combined = layered.configure_range(category=still_in_colors)

## write the chart out as a standalone HTML page
output_path = 'output/wimbledon_wta_winners_vs_ufe.html'
combined.save(output_path)