Commit: Upload 4 files
README.md CHANGED
@@ -1,7 +1,7 @@
 ---
-title:
+title: Instruction Model Outputs Filtered
 emoji: 💩
-colorFrom:
+colorFrom: blue
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.17.0
app.py ADDED
@@ -0,0 +1,42 @@
import os
from pathlib import Path

import pandas as pd
import streamlit as st
import utils as ut

st.set_page_config(layout="wide")


st.markdown("# Elo Rating of Models")
st.markdown(
    """This app shows the Elo rating of models on the H4 Hub based on their performance on the H4 eval dataset."""
)
st.markdown(
    """**Notes**
* This is currently using synthetic data
* You can tweak the number of tasks, models, and human ratings per task to generate different datasets
"""
)

# user input
num_tasks = st.number_input("Number of tasks", min_value=1, max_value=5000, value=100)
num_models = st.number_input("Number of models", min_value=1, max_value=100, value=4)
num_human_ratings = st.number_input(
    "Number of human ratings per task", min_value=1, max_value=10, value=3
)

button = st.button("Show me the leaderboard!")

if button:
    # generate synthetic data
    df = ut.create_synthetic_data(n_tasks=num_tasks, n_models=num_models, n_ratings=num_human_ratings)
    # calculate Elo ratings
    elo_df = ut.calculate_elo_rating(df)
    # show leaderboard
    ut.display_leaderboard(elo_df, n_models=num_models)
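The button handler above is a thin wrapper around the helpers in utils.py (shown further below). As a minimal sketch, the same pipeline can be exercised without Streamlit, assuming utils.py is importable and using small illustrative sizes:

import utils as ut

# generate a small synthetic dataset (sizes here are illustrative, not the app defaults)
df = ut.create_synthetic_data(n_tasks=10, n_models=4, n_ratings=3)

# compute the Elo-style ratings and print the per-model averages, highest first
elo_df = ut.calculate_elo_rating(df)
print(elo_df.groupby("model")["new_rating"].mean().sort_values(ascending=False))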
requirements.txt ADDED
@@ -0,0 +1,2 @@
datasets
python-dotenv
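Note that requirements.txt lists only datasets and python-dotenv; streamlit, pandas, and numpy are presumably provided by the Spaces Streamlit SDK image. For a local run outside Spaces, a fuller requirements file would be needed. A sketch, where the streamlit pin mirrors the sdk_version in the README and the other entries are deliberately left unpinned as assumptions:

streamlit==1.17.0
pandas
numpy
datasets
python-dotenv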
utils.py ADDED
@@ -0,0 +1,98 @@
import numpy as np
import pandas as pd
import streamlit as st


def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3):
    """Create a synthetic dataframe with human ratings of model performance on a set of tasks.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per task and model.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing human ratings of model performance on a set of tasks.
    """
    # build a long-format dataframe with n_ratings human ratings for each of n_models models on each of n_tasks tasks
    df = pd.DataFrame({'task': np.repeat(range(n_tasks), n_models * n_ratings),
                       'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
                       'rating': np.tile(np.random.randint(0, 5, n_models * n_ratings), n_tasks)})
    # calculate the mean score for each (task, model) pair
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # calculate the baseline (minimum) score for each task
    df['baseline'] = df.groupby('task')['score'].transform('min')
    # express each model's score relative to the baseline score
    df['score'] = df['score'] - df['baseline']
    # drop unnecessary columns
    df = df.drop(['rating', 'baseline'], axis=1)
    # drop duplicates
    df = df.drop_duplicates()
    return df


def calculate_elo_rating(df, k=32, initial_rating=0):
    """Calculate an Elo rating for each model based on human ratings of model performance on a set of tasks.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing human ratings of model performance on a set of tasks.
    k : int
        The k-factor.
    initial_rating : int
        The initial rating.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the Elo rating for each model.
    """
    # calculate an Elo rating for each model from its per-task scores
    df = df.copy()
    # create a dataframe with all possible combinations of tasks and models
    df_all = pd.DataFrame({'task': np.repeat(range(df['task'].max() + 1), df['model'].max() + 1),
                           'model': np.tile(range(df['model'].max() + 1), df['task'].max() + 1)})
    # merge with the original dataframe
    df = df_all.merge(df, on=['task', 'model'], how='left')
    # fill missing values with 0
    df['score'] = df['score'].fillna(0)
    # calculate the expected score for each model
    df['expected_score'] = df.groupby('model')['score'].transform(lambda x: 1 / (1 + 10 ** (-x / 400)))
    # calculate the actual score for each model (1 if above the task baseline, 0 otherwise)
    df['actual_score'] = df.groupby('model')['score'].transform(lambda x: x > 0).astype(int)
    # calculate the rating for each model
    df['rating'] = df.groupby('model')['expected_score'].transform(lambda x: x * k + initial_rating)
    # calculate the rating change for each model
    df['rating_change'] = df.groupby('model')['actual_score'].transform(lambda x: x * k)
    # calculate the new rating for each model
    df['new_rating'] = df['rating'] + df['rating_change']
    # drop unnecessary columns
    df = df.drop(['score', 'expected_score', 'actual_score', 'rating', 'rating_change'], axis=1)
    return df


def display_leaderboard(elo, n_models=4):
    """Display the Elo rating for each model as a leaderboard based on their ranking.

    Parameters
    ----------
    elo : pandas.DataFrame
        DataFrame containing the Elo rating for each model.
    n_models : int
        The number of models.
    """
    # calculate the average Elo rating for each model
    elo = elo.groupby('model')['new_rating'].mean().reset_index()
    # sort models by Elo rating
    elo = elo.sort_values('new_rating', ascending=False)
    # add a rank column
    elo['rank'] = range(1, n_models + 1)
    # display the Elo ratings as a leaderboard
    st.write(elo)
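For reference, the expected-score transform in calculate_elo_rating borrows the shape of the standard Elo formula, although here it is applied to a model's relative task score rather than to a rating difference against an opponent. In conventional Elo, for a player with rating R_A facing an opponent with rating R_B:

E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}, \qquad R_A' = R_A + K\,(S_A - E_A)

where S_A is the achieved score (1 for a win, 0.5 for a draw, 0 for a loss) and K is the k-factor (the code defaults to k=32).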