Peiran committed on
Commit
579bdeb
·
1 Parent(s): 6a51e6d

Ensure no duplicate evaluations: make submit idempotent, update pair_state to remove the evaluated pair, and clear the UI on completion; add graceful returns when there are no pairs

Browse files
Files changed (1) hide show
  1. app.py +40 -16
app.py CHANGED
@@ -379,6 +379,7 @@ def on_submit(
379
  ):
380
  if not task_name:
381
  return (
 
382
  gr.update(value=0),
383
  gr.update(value=""),
384
  gr.update(value=None),
@@ -391,6 +392,7 @@ def on_submit(
391
 
392
  if not pairs:
393
  return (
 
394
  gr.update(value=0, minimum=0, maximum=0, visible=False),
395
  gr.update(value=""),
396
  gr.update(value=None),
@@ -427,21 +429,40 @@ def on_submit(
427
  "model2_semantic_functional_alignment_score": int(a_semantic_score),
428
  "model2_overall_photorealism_score": int(a_overall_score),
429
  }
 
430
  row = _build_eval_row(pair, score_map)
431
- ok_local = _append_local_persist_csv(task_name, row)
432
- ok_hub, hub_msg = _upload_eval_record_to_dataset(task_name, row)
433
 
434
- next_index = min(index + 1, len(pairs) - 1)
435
- info = f"Saved evaluation for Test ID {pair['test_id']}."
436
- info += " Local persistence " + ("succeeded" if ok_local else "failed") + "."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  info += " Dataset upload " + ("succeeded" if ok_hub else "failed") + (f" ({hub_msg})" if hub_msg else "") + "."
438
 
439
- if next_index != index:
440
- pair = pairs[next_index]
 
441
  header = _format_pair_header(pair)
442
  a_path = pair["model2_path"] if pair.get("swap") else pair["model1_path"]
443
  b_path = pair["model1_path"] if pair.get("swap") else pair["model2_path"]
444
  return (
 
445
  gr.update(value=next_index),
446
  gr.update(value=header),
447
  _resolve_image_path(pair["org_img"]),
@@ -449,18 +470,20 @@ def on_submit(
449
  _resolve_image_path(b_path),
450
  3, 3, 3, 3,
451
  3, 3, 3, 3,
452
- gr.update(value=info + f" Moved to next pair ({next_index + 1}/{len(pairs)})."),
453
  )
454
 
 
455
  return (
456
- gr.update(),
457
- gr.update(),
458
- gr.update(),
459
- gr.update(),
460
- gr.update(),
 
461
  3, 3, 3, 3,
462
  3, 3, 3, 3,
463
- gr.update(value=info + " This is the last pair."),
464
  )
465
 
466
 
@@ -512,8 +535,8 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
512
  b_semantic_input = gr.Slider(1, 5, value=3, step=1, label="B: Semantic/Functional Alignment")
513
  b_overall_input = gr.Slider(1, 5, value=3, step=1, label="B: Overall Photorealism")
514
 
515
- submit_button = gr.Button("Submit Evaluation", variant="primary")
516
- feedback_box = gr.Markdown("")
517
 
518
  # Event bindings
519
  task_selector.change(
@@ -574,6 +597,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
574
  b_overall_input,
575
  ],
576
  outputs=[
 
577
  index_slider,
578
  pair_header,
579
  orig_image,
 
379
  ):
380
  if not task_name:
381
  return (
382
+ pairs,
383
  gr.update(value=0),
384
  gr.update(value=""),
385
  gr.update(value=None),
 
392
 
393
  if not pairs:
394
  return (
395
+ pairs,
396
  gr.update(value=0, minimum=0, maximum=0, visible=False),
397
  gr.update(value=""),
398
  gr.update(value=None),
 
429
  "model2_semantic_functional_alignment_score": int(a_semantic_score),
430
  "model2_overall_photorealism_score": int(a_overall_score),
431
  }
432
+ # Build record
433
  row = _build_eval_row(pair, score_map)
 
 
434
 
435
+ # Idempotency: check if this pair already evaluated; if so, skip writing
436
+ done_keys = _read_existing_eval_keys(task_name)
437
+ eval_key = (pair["test_id"], frozenset({pair["model1_name"], pair["model2_name"]}), pair["org_img"])
438
+ if eval_key in done_keys:
439
+ ok_local = False
440
+ ok_hub, hub_msg = (False, "Skipped duplicate; already evaluated.")
441
+ info_prefix = "Skipped duplicate submission."
442
+ else:
443
+ ok_local = _append_local_persist_csv(task_name, row)
444
+ # add key locally for subsequent filtering in this call
445
+ if ok_local:
446
+ done_keys.add(eval_key)
447
+ ok_hub, hub_msg = _upload_eval_record_to_dataset(task_name, row)
448
+ info_prefix = "Saved evaluation."
449
+
450
+ # Recompute remaining pairs by filtering current state against done_keys
451
+ def key_of(p: Dict[str, str]):
452
+ return (p["test_id"], frozenset({p["model1_name"], p["model2_name"]}), p["org_img"])
453
+ remaining_pairs = [p for p in pairs if key_of(p) not in done_keys]
454
+
455
+ info = f"{info_prefix} Local persistence " + ("succeeded" if ok_local else "skipped/failed") + "."
456
  info += " Dataset upload " + ("succeeded" if ok_hub else "failed") + (f" ({hub_msg})" if hub_msg else "") + "."
457
 
458
+ if remaining_pairs:
459
+ next_index = min(index, len(remaining_pairs) - 1)
460
+ pair = remaining_pairs[next_index]
461
  header = _format_pair_header(pair)
462
  a_path = pair["model2_path"] if pair.get("swap") else pair["model1_path"]
463
  b_path = pair["model1_path"] if pair.get("swap") else pair["model2_path"]
464
  return (
465
+ remaining_pairs,
466
  gr.update(value=next_index),
467
  gr.update(value=header),
468
  _resolve_image_path(pair["org_img"]),
 
470
  _resolve_image_path(b_path),
471
  3, 3, 3, 3,
472
  3, 3, 3, 3,
473
+ gr.update(value=info + f" Next pair ({next_index + 1}/{len(remaining_pairs)})."),
474
  )
475
 
476
+ # No remaining pairs: clear UI, hide slider, and return updated empty state
477
  return (
478
+ [],
479
+ gr.update(value=0, minimum=0, maximum=0, visible=False),
480
+ gr.update(value=""),
481
+ gr.update(value=None),
482
+ gr.update(value=None),
483
+ gr.update(value=None),
484
  3, 3, 3, 3,
485
  3, 3, 3, 3,
486
+ gr.update(value=info + " All pairs completed."),
487
  )
488
 
489
 
 
535
  b_semantic_input = gr.Slider(1, 5, value=3, step=1, label="B: Semantic/Functional Alignment")
536
  b_overall_input = gr.Slider(1, 5, value=3, step=1, label="B: Overall Photorealism")
537
 
538
+ submit_button = gr.Button("Submit Evaluation", variant="primary")
539
+ feedback_box = gr.Markdown("")
540
 
541
  # Event bindings
542
  task_selector.change(
 
597
  b_overall_input,
598
  ],
599
  outputs=[
600
+ pair_state,
601
  index_slider,
602
  pair_header,
603
  orig_image,