Spaces:

osunlp
/

Online_Mind2Web_Leaderboard

Running

WeijianQi1999 committed on Jul 16

Commit

d57548f

1 Parent(s): eb42cc9

update layout

Files changed (2) hide show

app.py CHANGED Viewed

@@ -34,8 +34,8 @@ def get_dataframe_from_results(eval_path):
     else:
         df = df.sort_values(
         by=["Verified", "Average SR"],
-        ascending=[False, False],
-        kind="mergesort"
     )
     for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
@@ -283,7 +283,7 @@ with demo:
         )
         gr.Markdown("### Visualization")
         gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
-        fig = plot_heatmap_with_performance_bar("./human_label.json")
         gr.Plot(fig)
         gr.Markdown(EVALUATION_DETAILS)

     else:
         df = df.sort_values(
         by=["Verified", "Average SR"],
+        ascending=[False, False],      # False 表示降序；Verified=True 会排到最上面
+        kind="mergesort"              # 稳定排序，保证次序可预期
     )
     for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
         )
         gr.Markdown("### Visualization")
         gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
+        fig = plot_heatmap_with_performance_bar("./human_label_071625.json")
         gr.Plot(fig)
         gr.Markdown(EVALUATION_DETAILS)

requirements.txt CHANGED Viewed

@@ -2,5 +2,4 @@ datasets
 gradio
 huggingface-hub
 numpy
-APScheduler
-plotly

 gradio
 huggingface-hub
 numpy
+APScheduler