Commit
·
d57548f
1
Parent(s):
eb42cc9
update layout
Browse files
- app.py +3 -3
- requirements.txt +1 -2
app.py
CHANGED
|
@@ -34,8 +34,8 @@ def get_dataframe_from_results(eval_path):
|
|
| 34 |
else:
|
| 35 |
df = df.sort_values(
|
| 36 |
by=["Verified", "Average SR"],
|
| 37 |
-
ascending=[False, False],
|
| 38 |
-
kind="mergesort"
|
| 39 |
)
|
| 40 |
|
| 41 |
for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
|
|
@@ -283,7 +283,7 @@ with demo:
|
|
| 283 |
)
|
| 284 |
gr.Markdown("### Visualization")
|
| 285 |
gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
|
| 286 |
-
fig = plot_heatmap_with_performance_bar("./
|
| 287 |
gr.Plot(fig)
|
| 288 |
gr.Markdown(EVALUATION_DETAILS)
|
| 289 |
|
|
|
|
| 34 |
else:
|
| 35 |
df = df.sort_values(
|
| 36 |
by=["Verified", "Average SR"],
|
| 37 |
+
ascending=[False, False], # False 表示降序;Verified=True 会排到最上面
|
| 38 |
+
kind="mergesort" # 稳定排序,保证次序可预期
|
| 39 |
)
|
| 40 |
|
| 41 |
for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
|
|
|
|
| 283 |
)
|
| 284 |
gr.Markdown("### Visualization")
|
| 285 |
gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
|
| 286 |
+
fig = plot_heatmap_with_performance_bar("./human_label_071625.json")
|
| 287 |
gr.Plot(fig)
|
| 288 |
gr.Markdown(EVALUATION_DETAILS)
|
| 289 |
|
requirements.txt
CHANGED
|
@@ -2,5 +2,4 @@ datasets
|
|
| 2 |
gradio
|
| 3 |
huggingface-hub
|
| 4 |
numpy
|
| 5 |
-
APScheduler
|
| 6 |
-
plotly
|
|
|
|
| 2 |
gradio
|
| 3 |
huggingface-hub
|
| 4 |
numpy
|
| 5 |
+
APScheduler
|
|
|