Spaces:

JustinLin610
/

ImageBind_zeroshot_demo

Runtime error

App Files Files Community

JustinLin610 committed on May 12, 2023

Commit

9dd993b

1 Parent(s): 5648cf2

add video

Browse files

Files changed (1) hide show

app.py +32 -4

app.py CHANGED Viewed

@@ -59,6 +59,30 @@ def audio_text_zeroshot(audio, text_list):
     return score_dict
 def inference(
     task,
     image=None,
@@ -69,6 +93,8 @@ def inference(
         result = image_text_zeroshot(image, text_list)
     elif task == "audio-text":
         result = audio_text_zeroshot(audio, text_list)
     else:
         raise NotImplementedError
     return result
@@ -80,6 +106,7 @@ def main():
             choices=[
                 "image-text",
                 "audio-text",
             ],
             type="value",
             default="image-text",
@@ -87,6 +114,7 @@ def main():
         ),
         gr.inputs.Image(type="filepath", label="Input image"),
         gr.inputs.Audio(type="filepath", label="Input audio"),
         gr.inputs.Textbox(lines=1, label="Candidate texts"),
     ]
@@ -95,10 +123,10 @@ def main():
         inputs,
         "label",
         examples=[
-            ["image-text", "assets/dog_image.jpg", None, "A dog|A car|A bird"],
-            ["image-text", "assets/car_image.jpg", None, "A dog|A car|A bird"],
-            ["audio-text", None, "assets/bird_audio.wav", "A dog|A car|A bird"],
-            ["audio-text", None, "assets/dog_audio.wav", "A dog|A car|A bird"],
         ],
         description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification and audio classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
                     To test your own cases, you can upload an image or an audio, and provide the candidate texts separated by "|".<br>

     return score_dict
+def video_text_zeroshot(video, text_list):
+    video_paths = [video]
+    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
+    inputs = {
+        ModalityType.TEXT: data.load_and_transform_text(labels, device),
+        ModalityType.VIDEO: data.load_and_transform_video_data(video_paths, device),
+    }
+    with torch.no_grad():
+        embeddings = model(inputs)
+    scores = (
+        torch.softmax(
+            embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1
+        )
+        .squeeze(0)
+        .tolist()
+    )
+    score_dict = {label: score for label, score in zip(labels, scores)}
+    return score_dict
 def inference(
     task,
     image=None,
         result = image_text_zeroshot(image, text_list)
     elif task == "audio-text":
         result = audio_text_zeroshot(audio, text_list)
+    elif task == "video-text":
+        result = audio_text_zeroshot(audio, text_list)
     else:
         raise NotImplementedError
     return result
             choices=[
                 "image-text",
                 "audio-text",
+                "video-text",
             ],
             type="value",
             default="image-text",
         ),
         gr.inputs.Image(type="filepath", label="Input image"),
         gr.inputs.Audio(type="filepath", label="Input audio"),
+        gr.inputs.Video(type="filepath", label="Input video"),
         gr.inputs.Textbox(lines=1, label="Candidate texts"),
     ]
         inputs,
         "label",
         examples=[
+            ["image-text", "assets/dog_image.jpg", None, None, "A dog|A car|A bird"],
+            ["image-text", "assets/car_image.jpg", None, None, "A dog|A car|A bird"],
+            ["audio-text", None, "assets/bird_audio.wav", None, "A dog|A car|A bird"],
+            ["video-text", None, "assets/dog_video.mp4", None, "A dog|A car|A bird"],
         ],
         description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification and audio classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
                     To test your own cases, you can upload an image or an audio, and provide the candidate texts separated by "|".<br>