Spaces:

JustinLin610
/

ImageBind_zeroshot_demo

Runtime error

App Files Community

JustinLin610 committed on May 12, 2023

Commit

179180d

1 Parent(s): a1ed4e4

reorder inputs

Browse files

Files changed (1) hide show

app.py +7 -7

app.py CHANGED Viewed

@@ -85,17 +85,17 @@ def video_text_zeroshot(video, text_list):
 def inference(
     task,
     image=None,
     audio=None,
     video=None,
-    text_list=None,
 ):
     if task == "image-text":
         result = image_text_zeroshot(image, text_list)
     elif task == "audio-text":
         result = audio_text_zeroshot(audio, text_list)
     elif task == "video-text":
-        result = audio_text_zeroshot(audio, text_list)
     else:
         raise NotImplementedError
     return result
@@ -113,10 +113,10 @@ def main():
             default="image-text",
             label="Task",
         ),
         gr.inputs.Image(type="filepath", label="Input image"),
         gr.inputs.Audio(type="filepath", label="Input audio"),
         gr.inputs.Video(type="filepath", label="Input video"),
-        gr.inputs.Textbox(lines=1, label="Candidate texts"),
     ]
     iface = gr.Interface(
@@ -124,10 +124,10 @@ def main():
         inputs,
         "label",
         examples=[
-            ["image-text", "assets/dog_image.jpg", None, None, "A dog|A car|A bird"],
-            ["image-text", "assets/car_image.jpg", None, None, "A dog|A car|A bird"],
-            ["audio-text", None, "assets/bird_audio.wav", None, "A dog|A car|A bird"],
-            ["video-text", None, "assets/dog_video.mp4", None, "A dog|A car|A bird"],
         ],
         description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification, audio classification, and video classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
                     To test your own cases, you can upload an image, an audio or a video, and provide the candidate texts separated by "|".<br>

 def inference(
     task,
+    text_list=None,
     image=None,
     audio=None,
     video=None,
 ):
     if task == "image-text":
         result = image_text_zeroshot(image, text_list)
     elif task == "audio-text":
         result = audio_text_zeroshot(audio, text_list)
     elif task == "video-text":
+        result = video_text_zeroshot(video, text_list)
     else:
         raise NotImplementedError
     return result
             default="image-text",
             label="Task",
         ),
+        gr.inputs.Textbox(lines=1, label="Candidate texts"),
         gr.inputs.Image(type="filepath", label="Input image"),
         gr.inputs.Audio(type="filepath", label="Input audio"),
         gr.inputs.Video(type="filepath", label="Input video"),
     ]
     iface = gr.Interface(
         inputs,
         "label",
         examples=[
+            ["image-text", "A dog|A car|A bird", "assets/dog_image.jpg", None, None],
+            ["image-text", "A dog|A car|A bird", "assets/car_image.jpg", None, None],
+            ["audio-text", "A dog|A car|A bird",  None, "assets/bird_audio.wav", None],
+            ["video-text", "A dog|A car|A bird", None, "assets/dog_video.mp4", None],
         ],
         description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification, audio classification, and video classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
                     To test your own cases, you can upload an image, an audio or a video, and provide the candidate texts separated by "|".<br>