diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py index 06f5145ce39..c42c73b8e11 100644 --- a/auto_gpt_benchmarking/__main__.py +++ b/auto_gpt_benchmarking/__main__.py @@ -18,11 +18,14 @@ from pathlib import Path from datetime import datetime import yaml +from datetime import datetime + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("eval", type=str, help="Name of an eval. See registry.") + parser.add_argument( + "eval", type=str, help="Name of an eval. See registry.") parser.add_argument( "--completion-fn", type=str, @@ -47,20 +50,27 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--extra_eval_params", type=str, default="") parser.add_argument("--max_samples", type=int, default=None) - parser.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument("--visible", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument( + "--cache", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--visible", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--seed", type=int, default=20220722) parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=str(Path(__file__).parent.parent / "data" / "records.jsonl")) + parser.add_argument("--record_path", type=str, default=str(Path( + __file__).parent.parent / "data" / f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl")) parser.add_argument( - "--log_to_file", type=str, default=None,#default=str( - # Path(__file__).parent.parent / "data" / "log" / "log.txt" - # ), help="Log to a file instead of stdout" + "--log_to_file", type=str, default=None, # default=str( + # Path(__file__).parent.parent / "data" / "log" / "log.txt" + # ), help="Log to a file instead of stdout" ) - parser.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--local-run", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--dry-run-logging", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--debug", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument( + "--local-run", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--dry-run", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--dry-run-logging", + action=argparse.BooleanOptionalAction, default=True) return parser.parse_args() @@ -76,7 +86,8 @@ def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) - with open(yaml_path, "r") as f: yaml_data = yaml.safe_load(f) if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: - raise Exception("You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") + raise Exception( + "You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") if auto_gpt_path is None: auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] if auto_gpt_path is not None: @@ -108,7 +119,8 @@ def load_env_file(env_path: Path): # Update the yaml file with the auto_gpt_path autogpt_path = update_yaml_with_auto_gpt_path( - str(Path(__file__).parent / "completion_fns" / "auto_gpt_completion_fn.yaml"), + str(Path(__file__).parent / "completion_fns" / + "auto_gpt_completion_fn.yaml"), args.auto_gpt_path ) diff --git a/evals_analytics.ipynb b/evals_analytics.ipynb new file mode 100644 index 00000000000..f1b48424c13 --- /dev/null +++ b/evals_analytics.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAc5klEQVR4nO3deZgddZ3v8feHALLKYoLGrCiIoBDABlRQQBEiKsHHLVEUvGDmOuK43RnBmQcYnHH0ehVHwYHoRERlEREmwwQRRhEVkTSILEEkRiAJYBrCpiAY+Nw/6tfDSVOdPunu6pNOf17Pc56u+v1q+Vaf5Hy6llMl20RERPS1UacLiIiI9VMCIiIiaiUgIiKiVgIiIiJqJSAiIqJWAiIiImolICI6TNJrJN3e6Toi+kpAREdJukrSg5Ke0+lamiDpIEnLa9qvknQcgO2f2t6ljWWdIunbTdQZUScBER0jaTrwGsDAESO87o1Hcn3ru/w+ok4CIjrpfcC1wNnA0a0dkqZI+r6kHkkPSDq9pe8Dkm6T9KikxZL2Lu2WtFPLdGdL+qcyfJCk5ZI+Kek+4BuStpN0aVnHg2V4csv820v6hqR7Sv8lpf0WSW9pmW4TSfdL2mswv4S+exmlxhVl+26X9HpJM4FPAe+S9EdJvy7TvlDSAkmrJC2R9IGW5Wwu6Zul9tsk/V2f9dxZ1nUT8CdJG0s6QdLvWn63b22Z/hhJP5d0mqSHJC2V9OrSvkzSSklrvI8xuiUgopPeB3ynvA6T9HwASeOAS4G7gOnAJOD80vcO4JQy73Op9jweaHN9LwC2B6YBc6n+/X+jjE8FHgdOb5n+W8AWwMuAHYDTSvs5wFEt0x0O3Gv7V23W0S9JuwDHA/vY3ho4DLjT9g+AzwAX2N7K9owyy/nAcuCFwNuBz0h6Xek7mer39yLgDX1q7jUHeBOwre3VwO+o9uq2Af4R+LakiS3T7wfcBDwPOLesfx9gp7L80yVtNdTfQ6wnbOeV14i/gAOAvwDjy/hvgI+V4VcBPcDGNfNdDnykn2Ua2Kll/Gzgn8rwQcCTwGZrqWlP4MEyPBF4GtiuZroXAo8Czy3j3wP+rp9lHlSW81Cf12rguJZplpfhnYCVwCHAJn2WdQrw7ZbxKcBTwNYtbf8CnF2GlwKHtfQd17ueMn4n8L8GeJ9uBGaV4WOAO1r6di+/8+e3tD0A7Nnpf195Dc8rexDRKUcDP7R9fxk/l2cOM00B7nL1F21fU6j+yh2MHtt/7h2RtIWksyTdJekR4Gpg27IHMwVYZfvBvguxfQ/wc+BtkrYF3ki1F9Sfe2xv2/oCflY3oe0lwEepwmClpPMlvbCf5b6w1PhoS9tdVHtcvf3LWvpah2vbJL1P0o3lENJDwMuB8S2T/KFl+PFSc9+27EFsIBIQMeIkbQ68EzhQ0n3lnMDHgBmSZlB9aE3t58TpMuDF/Sz6MapDQr1e0Ke/762LPwHsAuxn+7nAa3tLLOvZvgRAnW9SHVJ5B/AL2yv6mW6d2T7X9gFUh74MfK6f+u8pNW7d0jYV6K3lXmByS9+UutX1DkiaBnyN6hDX80qQ3UL1+4gxKAERnXAk1aGR3agO6+wJ7Ar8lOrcwnVUH26flbSlpM0k7V/m/TrwfyS9QpWdygcbVIdD3i1pXDmpe+AAdWxN9RfvQ5K2pzpmD4Dte4HLgK+Wk9mbSHpty7yXAHsDH6E6JzEsJO0i6XWqLvv9c6nv6dL9B2C6pI1KjcuAa4B/Kb+jPYBjgd5LYb8LnFjqn0T1wb82W1IFRk+p5f1UexAxRiUgohOOBr5h+27b9/W+qE4Qv4fqL9a3UB2Pv5vqJOy7AGxfCPwz1SGpR6k+qLcvy/1Ime+hspxLBqjjS8DmwP1UV1P9oE//e6nOk/yG6rzAR3s7bD8OXATsCHy/7S0f2HOAz5aa7qM6OX5i6buw/HxA0g1leA7Vieh7gIuBk21fWfpOpfrd/R64kupcyRP9rdj2YuALwC+owmh3qkNpMUbJzgODIgZD0knAS2zXXR203pH0QWC27YH2rCKA7EFEDEo5JHUsMK/TtfRH0kRJ+0vaqFw++wmqvYyItiQgItZR+TLaMuAy21d3up612BQ4i+pQ3I+A/wC+2tGKYlTJIaaIiKiVPYiIiKi1Qd2ga/z48Z4+fXqny4iIGDWuv/76+21PqOvboAJi+vTpdHd3d7qMiIhRQ9Jd/fXlEFNERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStxgJC1TOFf1yea3urpI/UTCNJXy7P0r1J5dnCpe9oSXeUV55zGxExwpr8HsRq4BO2bygPNLle0hXllsK93gjsXF77Af8G7Ndyb/4uqvvTXy9pQd3TvSIiohmN7UHYvtf2DWX4UeA2nnkUYq9ZwDmuXEv1uMeJVA9qv8J27yMfrwBmNlVrREQ824icg5A0HdgL+GWfrkms+Uzc5aWtv/a6Zc+V1C2pu6enZ9hqjohYn02cPBVJSGLi5KmNrKPxgJC0FdWTtz5q+5HhXr7teba7bHdNmFB7O5GIiA3OfSuWMe2TlzLtk5dy34plA88wCI0GhKRNqMLhO7brHsu4gjUfpD65tPXXHhERI6TJq5gE/Dtwm+0v9jPZAuB95WqmVwIPl4fFXw4cWh62vh1waGmLiIgR0uRVTPtTPfT9Zkk3lrZPAVMBbJ8JLAQOB5YAjwHvL32rJH0aWFTmO9X2qgZrjYiIPhoLCNs/AzTANAY+1E/ffGB+A6VFREQb8k3qiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNPTBI0nzgzcBK2y+v6f9b4D0tdewKTChPk7sTeBR4Clhtu6upOiMiol6TexBnAzP767T9edt72t4TOBH4SZ/Hih5c+hMOEREd0FhA2L4aaPc50nOA85qqJSIi1l3Hz0FI2oJqT+OilmYDP5R0vaS5naksImJsa+wcxDp4C/DzPoeXDrC9QtIOwBWSflP2SJ6lBMhcgKlTpzZfbUTEGNHxPQhgNn0OL9leUX6uBC4G9u1vZtvzbHfZ7powYUKjhUZEjCUdDQhJ2wAHAv/R0ralpK17h4FDgVs6U2FExNjV5GWu5wEHAeMlLQdOBjYBsH1mmeytwA9t/6ll1ucDF0vqre9c2z9oqs6IiKjXWEDYntPGNGdTXQ7b2rYUmNFMVRER0a714RxERESshxIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUaCwhJ8yWtlFT7PGlJB0l6WNKN5XVSS99MSbdLWiLphKZqjIiI/jW5B3E2MHOAaX5qe8/yOhVA0jjgDOCNwG7AHEm7NVhnRETUaCwgbF8NrBrErPsCS2wvtf0kcD4wa1iLi4iIAXX6HMSrJP1a0mWSXlbaJgHLWqZZXtpqSZorqVtSd09PT5O1RkSMKZ0MiBuAabZnAF8BLhnMQmzPs91lu2vChAnDWV9ExJjWsYCw/YjtP5bhhcAmksYDK4ApLZNOLm0RETGCOhYQkl4gSWV431LLA8AiYGdJO0raFJgNLOhUnRERY9XGTS1Y0nnAQcB4ScuBk4FNAGyfCbwd+KCk1cDjwGzbBlZLOh64HBgHzLd9a1N1RkREvcYCwvacAfpPB07vp28hsLCJuiIioj2dvoopIiLWUwmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNBYSk+ZJWSrqln/73SLpJ0s2SrpE0o6XvztJ+o6TupmqMiIj+tRUQknYfxLLPBmaupf/3wIG2dwc+Dczr03+w7T1tdw1i3RERMUTt7kF8VdJ1kv5a0jbtzGD7amDVWvqvsf1gGb0WmNxmLRERMQLaCgjbrwHeA0wBrpd0rqQ3DGMdxwKXta4S+KGk6yXNXduMkuZK6pbU3dPTM4wlRUSMbRu3O6HtOyT9A9ANfBnYS5KAT9n+/mALkHQwVUAc0NJ8gO0VknYArpD0m7JHUlfXPMrhqa6uLg+2joiIWFO75yD2kHQacBvwOuAttnctw6cNduWS9gC+Dsyy/UBvu+0V5edK4GJg38GuIyIiBqfdcxBfAW4AZtj+kO0bAGzfA/zDYFYsaSrwfeC9tn/b0r6lpK17h4FDgdoroSIiojntHmJ6E/C47acAJG0EbGb7MdvfqptB0nnAQcB4ScuBk4FNAGyfCZwEPI/qBDjA6nLF0vOBi0vbxsC5tn8wuM2LiIjBajcgrgQOAf5YxrcAfgi8ur8ZbM9Z2wJtHwccV9O+FJjx7DkiImIktXuIaTPbveFAGd6imZIiImJ90G5A/EnS3r0jkl4BPN5MSRERsT5o9xDTR4ELJd0DCHgB8K6mioqIiM5rKyBsL5L0UmCX0nS77b80V1ZERHRa21+UA/YBppd59paE7XMaqSoiIjqurYCQ9C3gxcCNwFOl2UACIiJiA9XuHkQXsJvt3MoiImKMaPcqpluoTkxHRMQY0e4exHhgsaTrgCd6G20f0UhVERHRce0GxClNFhEREeufdi9z/YmkacDOtq+UtAUwrtnSIiKik9q93fcHgO8BZ5WmScAlDdUUERHrgXZPUn8I2B94BKqHBwE7NFVURER0XrsB8YTtJ3tHJG1M9T2IiIjYQLUbED+R9Clg8/Is6guB/2yurIiI6LR2A+IEoAe4GfgrYCGDfJJcRESMDu1exfQ08LXyioiIMaDdq5h+L2lp31cb882XtFJS7TOlVfmypCWSburzzImjJd1RXke3v0kRETEc1uVeTL02A94BbN/GfGcDp9P/Tf3eCOxcXvsB/wbsJ2l7qmdYd1GdDL9e0gLbD7ZZb0REDFFbexC2H2h5rbD9JeBNbcx3NbBqLZPMAs5x5VpgW0kTgcOAK2yvKqFwBTCznVojImJ4tHuIae+WV5ek/826PUuiP5OAZS3jy0tbf+11tc2V1C2pu6enZxhKinZMnDwVSUhi4uSpo275se7ynow97X7If6FleDVwJ/DOYa9mEGzPA+YBdHV15bsZI+S+FcuY9slLAbjrc28edcuPdZf3ZOxp9yqmgxta/wpgSsv45NK2AjioT/tVDdUQERE12n2i3MfX1m/7i4Nc/wLgeEnnU52kftj2vZIuBz4jabsy3aHAiYNcR0REDMK6XMW0D9UHOsBbgOuAO9Y2k6TzqPYExktaTnVl0iYAts+k+sLd4cAS4DHg/aVvlaRPA4vKok61vbaT3RERMczaDYjJwN62HwWQdArwX7aPWttMtucM0G+qGwHW9c0H5rdZX0REDLN2b7XxfODJlvEnS1tERGyg2t2DOAe4TtLFZfxI4JuNVBQREeuFdq9i+mdJlwGvKU3vt/2r5sqKiIhOa/cQE8AWwCO2/xVYLmnHhmqKiIj1QLvfpD4Z+CTPXGq6CfDtpoqKiIjOa3cP4q3AEcCfAGzfA2zdVFEREdF57QbEk+WSVANI2rK5kiIiYn3QbkB8V9JZVHdb/QBwJXl4UETEBm3Aq5gkCbgAeCnwCLALcJLtKxquLSIiOmjAgLBtSQtt7071XIaIiBgD2j3EdIOkfRqtJCIi1ivtfpN6P+AoSXdSXckkqp2LPZoqLCIiOmutASFpqu27qR4BGhERY8hAexCXUN3F9S5JF9l+2wjUFBER64GBzkGoZfhFTRYSERHrl4ECwv0MR0TEBm6gQ0wzJD1CtSexeRmGZ05SP7fR6iIiomPWGhC2xw1l4ZJmAv8KjAO+bvuzffpPAw4uo1sAO9jetvQ9Bdxc+u62fcRQaomIiHXT7mWu60zSOOAM4A3AcmCRpAW2F/dOY/tjLdN/GNirZRGP296zqfoiImLt1uV5EOtqX2CJ7aW2nwTOB2atZfo5wHkN1hMREeugyYCYBCxrGV9e2p5F0jRgR+BHLc2bSeqWdK2kI/tbiaS5Zbrunp6eYSg7IiKg2YBYF7OB79l+qqVtmu0u4N3AlyS9uG5G2/Nsd9numjBhwkjUGhExJjQZECuAKS3jk0tbndn0Obxke0X5uRS4ijXPT0RERMOaDIhFwM6SdpS0KVUILOg7kaSXAtsBv2hp207Sc8rweGB/YHHfeSMiojmNXcVke7Wk44HLqS5znW/7VkmnAt22e8NiNnB+eWJdr12BsyQ9TRVin229+ikiIprXWEAA2F4ILOzTdlKf8VNq5rsG2L3J2iIiYu3Wl5PUERGxnklARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRqNCAkzZR0u6Qlkk6o6T9GUo+kG8vruJa+oyXdUV5HN1lnREQ8W2OPHJU0DjgDeAOwHFgkaUHNs6UvsH18n3m3B04GugAD15d5H2yq3oiIWFOTexD7AktsL7X9JHA+MKvNeQ8DrrC9qoTCFcDMhuqMiIgaTQbEJGBZy/jy0tbX2yTdJOl7kqas47xImiupW1J3T0/PcNQdERF0/iT1fwLTbe9BtZfwzXVdgO15trtsd02YMGHYC4yIGKuaDIgVwJSW8cml7X/YfsD2E2X068Ar2p03IiKa1WRALAJ2lrSjpE2B2cCC1gkkTWwZPQK4rQxfDhwqaTtJ2wGHlraIiBghjV3FZHu1pOOpPtjHAfNt3yrpVKDb9gLgbyQdAawGVgHHlHlXSfo0VcgAnGp7VVO1RkTEszUWEAC2FwIL+7Sd1DJ8InBiP/POB+Y3WV9ERPSv0yepIyJiPZWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKjVaEBIminpdklLJJ1Q0/9xSYsl3STpvyVNa+l7StKN5bWg77wREdGsxh45KmkccAbwBmA5sEjSAtuLWyb7FdBl+zFJHwT+L/Cu0ve47T2bqi8iItauyT2IfYEltpfafhI4H5jVOoHtH9t+rIxeC0xusJ6IiFgHTQbEJGBZy/jy0tafY4HLWsY3k9Qt6VpJR/Y3k6S5Zbrunp6eIRUcERHPaOwQ07qQdBTQBRzY0jzN9gpJLwJ+JOlm27/rO6/tecA8gK6uLo9IwRERY0CTexArgCkt45NL2xokHQL8PXCE7Sd6222vKD+XAlcBezVYa0RE9NFkQCwCdpa0o6RNgdnAGlcjSdoLOIsqHFa2tG8n6TlleDywP9B6cjsiIhrW2CEm26slHQ9cDowD5tu+VdKpQLftBcDnga2ACyUB3G37CGBX4CxJT1OF2Gf7XP0UERENa/QchO2FwMI+bSe1DB/Sz3zXALs3WVtERKxdvkkdERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRoNCEkzJd0uaYmkE2r6nyPpgtL/S0nTW/pOLO23SzqsyTojIuLZGgsISeOAM4A3ArsBcyTt1meyY4EHbe8EnAZ8rsy7GzAbeBkwE/hqWV5ERIyQJvcg9gWW2F5q+0ngfGBWn2lmAd8sw98DXi9Jpf1820/Y/j2wpCwvIiJGyMYNLnsSsKxlfDmwX3/T2F4t6WHgeaX92j7zTqpbiaS5wNwy+kdJtw+y3vHA/YOcd7Qa0jbf9bk3/89wlevDq6Hlj7X3eVi3t+n3fJiMmfe45f0YL2mw2zytv44mA2JE2J4HzBvqciR12+4ahpJGjWzzhm+sbS9km4dTk4eYVgBTWsYnl7baaSRtDGwDPNDmvBER0aAmA2IRsLOkHSVtSnXSeUGfaRYAR5fhtwM/su3SPrtc5bQjsDNwXYO1RkREH40dYirnFI4HLgfGAfNt3yrpVKDb9gLg34FvSVoCrKIKEcp03wUWA6uBD9l+qqlaiyEfphqFss0bvrG2vZBtHjaq/mCPiIhYU75JHRERtRIQERFRa8wFxFBu/zEatbG9H5e0WNJNkv5bUr/XRI8WA21zy3Rvk2RJo/6SyHa2WdI7y3t9q6RzR7rG4dbGv+2pkn4s6Vfl3/fhnahzuEiaL2mlpFv66ZekL5ffx02S9h7ySm2PmRfVyfLfAS8CNgV+DezWZ5q/Bs4sw7OBCzpdd8PbezCwRRn+4Gje3na3uUy3NXA11Rcyuzpd9wi8zzsDvwK2K+M7dLruEdjmecAHy/BuwJ2drnuI2/xaYG/gln76DwcuAwS8EvjlUNc51vYghnL7j9FowO21/WPbj5XRa6m+czKatfMeA3ya6t5ffx7J4hrSzjZ/ADjD9oMAtleOcI3DrZ1tNvDcMrwNcM8I1jfsbF9NdbVnf2YB57hyLbCtpIlDWedYC4i623/0vYXHGrf/AHpv/zEatbO9rY6l+gtkNBtwm8uu9xTb/zWShTWonff5JcBLJP1c0rWSZo5Ydc1oZ5tPAY6StBxYCHx4ZErrmHX9/z6gUX+rjRgeko4CuoADO11LkyRtBHwROKbDpYy0jakOMx1EtZd4taTdbT/UyaIaNgc42/YXJL2K6jtXL7f9dKcLGy3G2h7EUG7/MRq1dcsSSYcAfw8cYfuJEaqtKQNt89bAy4GrJN1Jdax2wSg/Ud3O+7wcWGD7L67ukPxbqsAYrdrZ5mOB7wLY/gWwGdWN/DZUw36LorEWEEO5/cdoNOD2StoLOIsqHEb7cWkYYJttP2x7vO3ptqdTnXc5wnZ3Z8odFu38u76Eau8BSeOpDjktHcEah1s723w38HoASbtSBUTPiFY5shYA7ytXM70SeNj2vUNZ4Jg6xOQh3P5jNGpzez8PbAVcWM7F3237iI4VPURtbvMGpc1tvhw4VNJi4Cngb22P1j3jdrf5E8DXJH2M6oT1MaP4jz0knUcV8uPLeZWTgU0AbJ9JdZ7lcKrn5zwGvH/I6xzFv6+IiGjQWDvEFBERbUpARERErQRERETUSkBEREStBERERNRKQET0IenIcpfXl3a6lohOSkBEPNsc4GflZyMkjWtq2RHDJQER0ULSVsABVLdpmF3axkn6f5JuKffZ/3Bp30fSNZJ+Lek6SVtLOkbS6S3Lu1TSQWX4j5K+IOnXwKsknSRpUVnuvN67BkvaSdKVZbk3SHqxpHMkHdmy3O9IqrtLbcSwSUBErGkW8APbvwUekPQKYC4wHdjT9h7Ad8rtHS4APmJ7BnAI8PgAy96S6h79M2z/DDjd9j62Xw5sDry5TPcdqltzzwBeDdxL9Q3/YwAkbVPaN5S70cZ6KgERsaY5VM8WoPycQ/Xhf1a5/Tu2VwG7APfaXlTaHuntX4ungItaxg9W9dTCm4HXAS+TtDUwyfbFZbl/tv2Y7Z9Q3XtoQqnpojbWFzEkY+peTBFrI2l7qg/q3SWZ6h4/proxXLtWs+YfXpu1DP/Z9lNlXZsBX6V6mt0ySaf0mbbOOcBRVIe+hnyfnYiBZA8i4hlvB75le1q52+sU4PdUj7P8q3L7994guR2YKGmf0rZ16b8T2FPSRpKmUD35rE5vGNxfznu8HcD2o8Dy3vMNqp6RvkWZ9mzgo2W6xcO21RH9SEBEPGMOcHGftouAiVS3jr6pnGB+d3nM5buAr5S2K6g+9H9OFSqLgS8DN9StqDyo52vALVR3JG3dS3kv8DeSbgKuAV5Q5vkDcBvwjaFuaEQ7cjfXiFGi7EncDOxt++FO1xMbvuxBRIwC5al/twFfSTjESMkeRERE1MoeRERE1EpARERErQRERETUSkBEREStBERERNT6/5WLAWlxQhHkAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def load_jsonl_files_recursively(dir_path):\n", + " all_data = []\n", + " \n", + " for root, _, files in os.walk(dir_path):\n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\") as f:\n", + " file_data = [json.loads(line) for line in f]\n", + " all_data.extend(file_data)\n", + " \n", + " return all_data\n", + "\n", + "def extract_accuracies(data):\n", + " accuracies = []\n", + " for record in data:\n", + " if 'final_report' in record:\n", + " accuracy = record['final_report']['accuracy']\n", + " accuracies.append(accuracy)\n", + " return accuracies\n", + "\n", + "# Load the data recursively\n", + "dir_path = \"evals\"\n", + "data = load_jsonl_files_recursively(dir_path)\n", + "\n", + "# Extract accuracies from the data\n", + "accuracies = extract_accuracies(data)\n", + "\n", + "# Plot the accuracies in a histogram chart\n", + "plt.hist(accuracies, bins=100, range=(0, 1), edgecolor='black')\n", + "plt.xlabel(\"Accuracy\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.title(\"Accuracy Histogram\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counts for each eval_name:\n", + "test-match.s1.simple-v0: 22\n", + "None: 45\n", + "test-fuzzy-match.s1.simple-v0: 2\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def print_graph():\n", + " directory = 'evals/'\n", + " files = os.listdir(directory)\n", + " \n", + " eval_name_counter = {}\n", + " \n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " with open(os.path.join(directory, file), 'r') as f:\n", + " jsonl_content = f.read()\n", + " \n", + " # Read the JSONL content into a DataFrame\n", + " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", + " df = pd.DataFrame(data)\n", + "\n", + " if 'spec' not in df.columns:\n", + " continue\n", + "\n", + " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", + " df['eval_name'] = df['spec'].apply(lambda x: x['eval_name'] if isinstance(x, dict) else None)\n", + "\n", + " for eval_name in df['eval_name']:\n", + " if eval_name not in eval_name_counter:\n", + " eval_name_counter[eval_name] = 0\n", + " eval_name_counter[eval_name] += 1\n", + "\n", + " # Print the counts\n", + " print(\"Counts for each eval_name:\")\n", + " for eval_name, count in eval_name_counter.items():\n", + " print(f\"{eval_name}: {count}\")\n", + "\n", + "print_graph()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counts for each eval_name:\n", + "test-match.s1.simple-v0: 22\n", + "test-fuzzy-match.s1.simple-v0: 2\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def print_graph():\n", + " directory = 'evals/'\n", + " files = os.listdir(directory)\n", + " \n", + " eval_name_counter = {}\n", + " \n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " with open(os.path.join(directory, file), 'r') as f:\n", + " jsonl_content = f.read()\n", + " \n", + " # Read the JSONL content into a DataFrame\n", + " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", + " df = pd.DataFrame(data)\n", + "\n", + " if 'spec' not in df.columns:\n", + " continue\n", + "\n", + " # Filter the DataFrame to only include rows with the \"spec\" key\n", + " spec_df = df[df['spec'].notna()].copy()\n", + "\n", + " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", + " spec_df.loc[:, 'eval_name'] = spec_df['spec'].apply(lambda x: x['eval_name'])\n", + "\n", + " for eval_name in spec_df['eval_name']:\n", + " if eval_name not in eval_name_counter:\n", + " eval_name_counter[eval_name] = 0\n", + " eval_name_counter[eval_name] += 1\n", + "\n", + " # Print the counts\n", + " print(\"Counts for each eval_name:\")\n", + " for eval_name, count in eval_name_counter.items():\n", + " print(f\"{eval_name}: {count}\")\n", + "\n", + "print_graph()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.0 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}