Commit: Upload 4 files
README.md CHANGED
@@ -1,7 +1,7 @@
 ---
-title:
+title: Instruction Model Outputs Filtered
 emoji: 💩
-colorFrom:
+colorFrom: blue
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.17.0
app.py ADDED
@@ -0,0 +1,42 @@
import os
from pathlib import Path

import pandas as pd
import streamlit as st
import utils as ut

st.set_page_config(layout="wide")


st.markdown("# Elo Rating of Models")
st.markdown(
    """This app shows the Elo rating of models on the H4 Hub based on their performance on the H4 eval dataset."""
)
st.markdown(
    """**Notes**
* This is currently using synthetic data
* You can tweak the number of tasks, models, and human ratings per task to generate different datasets
"""
)

# user input
num_tasks = st.number_input("Number of tasks", min_value=1, max_value=5000, value=100)
num_models = st.number_input("Number of models", min_value=1, max_value=100, value=4)
num_human_ratings = st.number_input(
    "Number of human ratings per task", min_value=1, max_value=10, value=3
)

button = st.button("Show me the leaderboard!")

if button:
    # generate synthetic data
    df = ut.create_synthetic_data(n_tasks=num_tasks, n_models=num_models, n_ratings=num_human_ratings)
    # calculate Elo ratings
    elo_df = ut.calculate_elo_rating(df)
    # show leaderboard
    ut.display_leaderboard(elo_df, n_models=num_models)
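The button handler above is a thin wrapper around the helpers in utils.py (shown further below). As a minimal sketch, the same pipeline can be exercised without Streamlit, assuming utils.py is importable and using small illustrative sizes:

import utils as ut

# generate a small synthetic dataset (sizes here are illustrative, not the app defaults)
df = ut.create_synthetic_data(n_tasks=10, n_models=4, n_ratings=3)

# compute the Elo-style ratings and print the per-model averages, highest first
elo_df = ut.calculate_elo_rating(df)
print(elo_df.groupby("model")["new_rating"].mean().sort_values(ascending=False))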
requirements.txt ADDED
@@ -0,0 +1,2 @@
datasets
python-dotenv
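Note that requirements.txt lists only datasets and python-dotenv; streamlit, pandas, and numpy are presumably provided by the Spaces Streamlit SDK image. For a local run outside Spaces, a fuller requirements file would be needed. A sketch, where the streamlit pin mirrors the sdk_version in the README and the other entries are deliberately left unpinned as assumptions:

streamlit==1.17.0
pandas
numpy
datasets
python-dotenv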
utils.py ADDED
@@ -0,0 +1,98 @@
import numpy as np
import pandas as pd
import streamlit as st


def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3):
    """Create a synthetic dataframe with human ratings of model performance on a set of tasks.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per task and model.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing human ratings of model performance on a set of tasks.
    """
    # build a long-format dataframe with n_ratings human ratings for each of n_models models on each of n_tasks tasks
    df = pd.DataFrame({'task': np.repeat(range(n_tasks), n_models * n_ratings),
                       'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
                       'rating': np.tile(np.random.randint(0, 5, n_models * n_ratings), n_tasks)})
    # calculate the mean score for each (task, model) pair
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # calculate the baseline (minimum) score for each task
    df['baseline'] = df.groupby('task')['score'].transform('min')
    # express each model's score relative to the baseline score
    df['score'] = df['score'] - df['baseline']
    # drop unnecessary columns
    df = df.drop(['rating', 'baseline'], axis=1)
    # drop duplicates
    df = df.drop_duplicates()
    return df


def calculate_elo_rating(df, k=32, initial_rating=0):
    """Calculate an Elo rating for each model based on human ratings of model performance on a set of tasks.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing human ratings of model performance on a set of tasks.
    k : int
        The k-factor.
    initial_rating : int
        The initial rating.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the Elo rating for each model.
    """
    # calculate an Elo rating for each model from its per-task scores
    df = df.copy()
    # create a dataframe with all possible combinations of tasks and models
    df_all = pd.DataFrame({'task': np.repeat(range(df['task'].max() + 1), df['model'].max() + 1),
                           'model': np.tile(range(df['model'].max() + 1), df['task'].max() + 1)})
    # merge with the original dataframe
    df = df_all.merge(df, on=['task', 'model'], how='left')
    # fill missing values with 0
    df['score'] = df['score'].fillna(0)
    # calculate the expected score for each model
    df['expected_score'] = df.groupby('model')['score'].transform(lambda x: 1 / (1 + 10 ** (-x / 400)))
    # calculate the actual score for each model (1 if above the task baseline, 0 otherwise)
    df['actual_score'] = df.groupby('model')['score'].transform(lambda x: x > 0).astype(int)
    # calculate the rating for each model
    df['rating'] = df.groupby('model')['expected_score'].transform(lambda x: x * k + initial_rating)
    # calculate the rating change for each model
    df['rating_change'] = df.groupby('model')['actual_score'].transform(lambda x: x * k)
    # calculate the new rating for each model
    df['new_rating'] = df['rating'] + df['rating_change']
    # drop unnecessary columns
    df = df.drop(['score', 'expected_score', 'actual_score', 'rating', 'rating_change'], axis=1)
    return df


def display_leaderboard(elo, n_models=4):
    """Display the Elo rating for each model as a leaderboard based on their ranking.

    Parameters
    ----------
    elo : pandas.DataFrame
        DataFrame containing the Elo rating for each model.
    n_models : int
        The number of models.
    """
    # calculate the average Elo rating for each model
    elo = elo.groupby('model')['new_rating'].mean().reset_index()
    # sort models by Elo rating
    elo = elo.sort_values('new_rating', ascending=False)
    # add a rank column
    elo['rank'] = range(1, n_models + 1)
    # display the Elo ratings as a leaderboard
    st.write(elo)
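For reference, the expected-score transform in calculate_elo_rating borrows the shape of the standard Elo formula, although here it is applied to a model's relative task score rather than to a rating difference against an opponent. In conventional Elo, for a player with rating R_A facing an opponent with rating R_B:

E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}, \qquad R_A' = R_A + K\,(S_A - E_A)

where S_A is the achieved score (1 for a win, 0.5 for a draw, 0 for a loss) and K is the k-factor (the code defaults to k=32).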