-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdebug.txt
117 lines (116 loc) · 9.97 KB
/
debug.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
| Task |Version| Metric |Value | |Stderr|
|-------------------------------------------------|------:|--------|-----:|---|-----:|
|hendrycksTest-abstract_algebra | 1|acc |0.2300|± |0.0423|
| | |acc_norm|0.2300|± |0.0423|
|hendrycksTest-anatomy | 1|acc |0.2519|± |0.0375|
| | |acc_norm|0.2519|± |0.0375|
|hendrycksTest-astronomy | 1|acc |0.2697|± |0.0361|
| | |acc_norm|0.2697|± |0.0361|
|hendrycksTest-business_ethics | 1|acc |0.1600|± |0.0368|
| | |acc_norm|0.1600|± |0.0368|
|hendrycksTest-clinical_knowledge | 1|acc |0.2943|± |0.0280|
| | |acc_norm|0.2943|± |0.0280|
|hendrycksTest-college_biology | 1|acc |0.2917|± |0.0380|
| | |acc_norm|0.2917|± |0.0380|
|hendrycksTest-college_chemistry | 1|acc |0.3300|± |0.0473|
| | |acc_norm|0.3300|± |0.0473|
|hendrycksTest-college_computer_science | 1|acc |0.3400|± |0.0476|
| | |acc_norm|0.3400|± |0.0476|
|hendrycksTest-college_mathematics | 1|acc |0.3100|± |0.0465|
| | |acc_norm|0.3100|± |0.0465|
|hendrycksTest-college_medicine | 1|acc |0.3699|± |0.0368|
| | |acc_norm|0.3699|± |0.0368|
|hendrycksTest-college_physics | 1|acc |0.2647|± |0.0439|
| | |acc_norm|0.2647|± |0.0439|
|hendrycksTest-computer_security | 1|acc |0.1900|± |0.0394|
| | |acc_norm|0.1900|± |0.0394|
|hendrycksTest-conceptual_physics | 1|acc |0.2809|± |0.0294|
| | |acc_norm|0.2809|± |0.0294|
|hendrycksTest-econometrics | 1|acc |0.2544|± |0.0410|
| | |acc_norm|0.2544|± |0.0410|
|hendrycksTest-electrical_engineering | 1|acc |0.2552|± |0.0363|
| | |acc_norm|0.2552|± |0.0363|
|hendrycksTest-elementary_mathematics | 1|acc |0.2646|± |0.0227|
| | |acc_norm|0.2646|± |0.0227|
|hendrycksTest-formal_logic | 1|acc |0.2460|± |0.0385|
| | |acc_norm|0.2460|± |0.0385|
|hendrycksTest-global_facts | 1|acc |0.3000|± |0.0461|
| | |acc_norm|0.3000|± |0.0461|
|hendrycksTest-high_school_biology | 1|acc |0.3065|± |0.0262|
| | |acc_norm|0.3065|± |0.0262|
|hendrycksTest-high_school_chemistry | 1|acc |0.2562|± |0.0307|
| | |acc_norm|0.2562|± |0.0307|
|hendrycksTest-high_school_computer_science | 1|acc |0.2000|± |0.0402|
| | |acc_norm|0.2000|± |0.0402|
|hendrycksTest-high_school_european_history | 1|acc |0.2545|± |0.0340|
| | |acc_norm|0.2545|± |0.0340|
|hendrycksTest-high_school_geography | 1|acc |0.3232|± |0.0333|
| | |acc_norm|0.3232|± |0.0333|
|hendrycksTest-high_school_government_and_politics| 1|acc |0.3264|± |0.0338|
| | |acc_norm|0.3264|± |0.0338|
|hendrycksTest-high_school_macroeconomics | 1|acc |0.3590|± |0.0243|
| | |acc_norm|0.3590|± |0.0243|
|hendrycksTest-high_school_mathematics | 1|acc |0.2481|± |0.0263|
| | |acc_norm|0.2481|± |0.0263|
|hendrycksTest-high_school_microeconomics | 1|acc |0.3319|± |0.0306|
| | |acc_norm|0.3319|± |0.0306|
|hendrycksTest-high_school_physics | 1|acc |0.2583|± |0.0357|
| | |acc_norm|0.2583|± |0.0357|
|hendrycksTest-high_school_psychology | 1|acc |0.3450|± |0.0204|
| | |acc_norm|0.3450|± |0.0204|
|hendrycksTest-high_school_statistics | 1|acc |0.4491|± |0.0339|
| | |acc_norm|0.4491|± |0.0339|
|hendrycksTest-high_school_us_history | 1|acc |0.2451|± |0.0302|
| | |acc_norm|0.2451|± |0.0302|
|hendrycksTest-high_school_world_history | 1|acc |0.2363|± |0.0277|
| | |acc_norm|0.2363|± |0.0277|
|hendrycksTest-human_aging | 1|acc |0.1480|± |0.0238|
| | |acc_norm|0.1480|± |0.0238|
|hendrycksTest-human_sexuality | 1|acc |0.2748|± |0.0392|
| | |acc_norm|0.2748|± |0.0392|
|hendrycksTest-international_law | 1|acc |0.1570|± |0.0332|
| | |acc_norm|0.1570|± |0.0332|
|hendrycksTest-jurisprudence | 1|acc |0.2407|± |0.0413|
| | |acc_norm|0.2407|± |0.0413|
|hendrycksTest-logical_fallacies | 1|acc |0.3067|± |0.0362|
| | |acc_norm|0.3067|± |0.0362|
|hendrycksTest-machine_learning | 1|acc |0.2232|± |0.0395|
| | |acc_norm|0.2232|± |0.0395|
|hendrycksTest-management | 1|acc |0.3107|± |0.0458|
| | |acc_norm|0.3107|± |0.0458|
|hendrycksTest-marketing | 1|acc |0.2564|± |0.0286|
| | |acc_norm|0.2564|± |0.0286|
|hendrycksTest-medical_genetics | 1|acc |0.2900|± |0.0456|
| | |acc_norm|0.2900|± |0.0456|
|hendrycksTest-miscellaneous | 1|acc |0.2018|± |0.0144|
| | |acc_norm|0.2018|± |0.0144|
|hendrycksTest-moral_disputes | 1|acc |0.2543|± |0.0234|
| | |acc_norm|0.2543|± |0.0234|
|hendrycksTest-moral_scenarios | 1|acc |0.2380|± |0.0142|
| | |acc_norm|0.2380|± |0.0142|
|hendrycksTest-nutrition | 1|acc |0.2614|± |0.0252|
| | |acc_norm|0.2614|± |0.0252|
|hendrycksTest-philosophy | 1|acc |0.3055|± |0.0262|
| | |acc_norm|0.3055|± |0.0262|
|hendrycksTest-prehistory | 1|acc |0.2623|± |0.0245|
| | |acc_norm|0.2623|± |0.0245|
|hendrycksTest-professional_accounting | 1|acc |0.2837|± |0.0269|
| | |acc_norm|0.2837|± |0.0269|
|hendrycksTest-professional_law | 1|acc |0.2477|± |0.0110|
| | |acc_norm|0.2477|± |0.0110|
|hendrycksTest-professional_medicine | 1|acc |0.4412|± |0.0302|
| | |acc_norm|0.4412|± |0.0302|
|hendrycksTest-professional_psychology | 1|acc |0.2386|± |0.0172|
| | |acc_norm|0.2386|± |0.0172|
|hendrycksTest-public_relations | 1|acc |0.3273|± |0.0449|
| | |acc_norm|0.3273|± |0.0449|
|hendrycksTest-security_studies | 1|acc |0.3918|± |0.0313|
| | |acc_norm|0.3918|± |0.0313|
|hendrycksTest-sociology | 1|acc |0.2836|± |0.0319|
| | |acc_norm|0.2836|± |0.0319|
|hendrycksTest-us_foreign_policy | 1|acc |0.3100|± |0.0465|
| | |acc_norm|0.3100|± |0.0465|
|hendrycksTest-virology | 1|acc |0.2410|± |0.0333|
| | |acc_norm|0.2410|± |0.0333|
|hendrycksTest-world_religions | 1|acc |0.2865|± |0.0347|
| | |acc_norm|0.2865|± |0.0347|