-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrating.py
110 lines (82 loc) · 3.57 KB
/
rating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from pdb import set_trace as T
from collections import defaultdict
import numpy as np
import openskill
def rank(policy_ids, scores):
    '''Rank policies by their mean per-agent score (rank 0 = highest mean)

    Scores are grouped by policy id and averaged. The returned list holds
    one rank per distinct policy, ordered by sorted policy id -- not by
    the order in which ids appear in policy_ids.

    Args:
        policy_ids: Policy id of each agent episode
        scores: Score of each agent episode, aligned with policy_ids

    Returns:
        List of integer ranks, one per distinct policy'''
    per_policy = defaultdict(list)
    for pid, value in zip(policy_ids, scores):
        per_policy[pid].append(value)

    # Negated means so that an ascending sort puts the best policy first.
    # A tiny random jitter is added to each key to break ties.
    keys = []
    for _, values in sorted(per_policy.items()):
        jitter = 1e-8 * np.random.normal()
        keys.append(-np.mean(values) + jitter)

    # argsort gives the ordering; argsort of the ordering gives the ranks
    ordering = np.argsort(keys)
    return np.argsort(ordering).tolist()
class OpenSkillRating:
    '''OpenSkill Rating wrapper for estimating relative policy skill

    Provides a simple method for updating skill estimates from raw
    per-agent scores as are typically returned by the environment.

    One policy may be designated the anchor. Each time the baseline is
    re-applied (on construction, set_anchor and update) the anchor's mu
    is pinned to anchor_mu and every policy's sigma is reset to sigma.'''

    def __init__(self, mu, anchor_mu, sigma, agents=(), anchor=None):
        '''
        Args:
            mu: Initial mu given to every newly added policy
            anchor_mu: Value the anchor policy's mu is pinned to
                (NOTE: earlier docs said the anchor point cannot be
                exactly 0 -- not verifiable from this file)
            sigma: Sigma given to new policies and re-applied to all
                policies whenever the baseline is anchored. Earlier docs:
                68/95/99.7 win rate against 1/2/3 sigma lower SR
            agents: Ordered iterable of policy names to rate
            anchor: Baseline policy name to pin to anchor_mu. It is only
                pinned if it is also present among the rated policies'''
        err = 'Agents must be ordered (e.g. list, not set)'
        assert not isinstance(agents, (set, frozenset)), err

        self.ratings = {}
        self.mu = mu
        self.anchor_mu = anchor_mu
        self.sigma = sigma

        for e in agents:
            self.add_policy(e)

        self.anchor = anchor
        self._anchor_baseline()

    def __str__(self):
        return ', '.join(f'{p}: {int(r.mu)}' for p, r in self.ratings.items())

    @property
    def stats(self):
        '''Dictionary of integer-truncated mu keyed by policy name'''
        return {p: int(r.mu) for p, r in self.ratings.items()}

    def _anchor_baseline(self):
        '''Resets every sigma to self.sigma and the anchor's mu to anchor_mu'''
        for agent, rating in self.ratings.items():
            rating.sigma = self.sigma
            if agent == self.anchor:
                rating.mu = self.anchor_mu

    @staticmethod
    def _policies_in_rank_order(policy_ids, scores):
        '''Distinct scored policy ids, sorted -- the order rank() reports in'''
        return sorted({pid for pid, _ in zip(policy_ids, scores)})

    def set_anchor(self, name):
        '''Replaces the anchor policy

        The previous anchor (if any) is removed from the ratings, name is
        added as a new policy and then pinned to anchor_mu.'''
        if self.anchor is not None:
            self.remove_policy(self.anchor)

        self.add_policy(name)
        self.anchor = name
        self._anchor_baseline()

    def add_policy(self, name):
        '''Starts rating a new policy at (self.mu, self.sigma)'''
        assert name not in self.ratings, f'Policy {name} already added to ratings'
        self.ratings[name] = openskill.Rating(mu=self.mu, sigma=self.sigma)

    def remove_policy(self, name):
        '''Stops rating a policy and discards its rating'''
        assert name in self.ratings, f'Policy {name} not in ratings'
        del self.ratings[name]

    def update(self, policy_ids, ranks=None, scores=None):
        '''Updates internal skill rating estimates for each policy

        You should call this function once per simulated environment
        Provide either ranks OR scores (never both)

        Args:
            policy_ids: With ranks: one policy ID per rank, in the same
                order. With scores: policy ID of each agent episode
                (IDs may repeat; scores are averaged per policy)
            ranks: List of ranks in the same order as policy_ids
            scores: List of scores for each agent episode

        Returns:
            The internal dictionary of ratings keyed by policy name'''
        err = 'Specify either ranks or scores'
        assert (ranks is None) != (scores is None), err
        assert self.anchor is not None, 'Set the anchor policy before updating ratings'

        policy_ids = list(policy_ids)
        if ranks is None:
            scores = list(scores)
            ranks = rank(policy_ids, scores)
            # rank() yields one rank per distinct policy, ordered by sorted
            # policy id. Build the teams in that same order so each rank is
            # applied to the policy it was computed for.
            policy_ids = self._policies_in_rank_order(policy_ids, scores)

        # Each policy competes as a single-member team
        teams = [[self.ratings[e]] for e in policy_ids]
        ratings = openskill.rate(teams, rank=ranks)
        ratings = [openskill.create_rating(team[0]) for team in ratings]
        for agent, rating in zip(policy_ids, ratings):
            self.ratings[agent] = rating

        self._anchor_baseline()

        return self.ratings