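# Gradio demo for ImageBind zero-shot cross-modal classification
# (image-text, audio-text, and video-text). Assumes the ImageBind repo
# (https://github.com/facebookresearch/ImageBind) is on the Python path so that
# `data` and `models` resolve to its modules.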
import data
import torch
import gradio as gr
from models import imagebind_model
from models.imagebind_model import ModalityType

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)
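
# Zero-shot image classification: embed the image and the candidate labels with
# ImageBind, then softmax the vision-text similarity scores over the labels.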
def image_text_zeroshot(image, text_list):
    image_paths = [image]
    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    scores = (
        torch.softmax(
            embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1
        )
        .squeeze(0)
        .tolist()
    )

    score_dict = {label: score for label, score in zip(labels, scores)}
    return score_dict
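
# Zero-shot audio classification: the same pattern, with the clip embedded
# through ImageBind's audio pathway and compared against the text embeddings.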
def audio_text_zeroshot(audio, text_list):
    audio_paths = [audio]
    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    scores = (
        torch.softmax(
            embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1
        )
        .squeeze(0)
        .tolist()
    )

    score_dict = {label: score for label, score in zip(labels, scores)}
    return score_dict
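
# Zero-shot video classification: frames are loaded with
# data.load_and_transform_video_data and embedded under the shared VISION modality.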
def video_text_zeroshot(video, text_list):
    video_paths = [video]
    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        ModalityType.VISION: data.load_and_transform_video_data(video_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    scores = (
        torch.softmax(
            embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1
        )
        .squeeze(0)
        .tolist()
    )

    score_dict = {label: score for label, score in zip(labels, scores)}
    return score_dict
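
# Route the selected task to the matching zero-shot helper; only the input for
# that task needs to be provided, the other modality inputs stay None.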
def inference(
    task,
    text_list=None,
    image=None,
    audio=None,
    video=None,
):
    if task == "image-text":
        result = image_text_zeroshot(image, text_list)
    elif task == "audio-text":
        result = audio_text_zeroshot(audio, text_list)
    elif task == "video-text":
        result = video_text_zeroshot(video, text_list)
    else:
        raise NotImplementedError
    return result
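
# Build the Gradio interface: a task selector, a "|"-separated candidate-text box,
# and one input component per modality (components for unused modalities are left empty).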
def main():
    # The old gr.inputs.* namespace has been removed in newer Gradio releases;
    # the top-level components below are the current equivalents (default= -> value=).
    inputs = [
        gr.Radio(
            choices=[
                "image-text",
                "audio-text",
                "video-text",
            ],
            type="value",
            value="image-text",
            label="Task",
        ),
        gr.Textbox(lines=1, label="Candidate texts"),
        gr.Image(type="filepath", label="Input image"),
        gr.Audio(type="filepath", label="Input audio"),
        gr.Video(label="Input video"),
    ]

    iface = gr.Interface(
        inference,
        inputs,
        "label",
        examples=[
            ["image-text", "A dog|A car|A bird", "assets/dog_image.jpg", None, None],
            ["image-text", "A dog|A car|A bird", "assets/car_image.jpg", None, None],
            ["audio-text", "A dog|A car|A bird", None, "assets/bird_audio.wav", None],
            ["video-text", "A dog|A car|A bird", None, None, "assets/dog_video.mp4"],
        ],
        description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification, audio classification, and video classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
To test your own cases, you can upload an image, an audio clip, or a video, and provide the candidate texts separated by "|".<br>
You can duplicate this space and run it privately: <a href='https://huggingface.co/spaces/OFA-Sys/chinese-clip-zero-shot-image-classification?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>""",
        title="ImageBind: Zero-shot Cross-modal Understanding",
    )
    iface.launch()


if __name__ == "__main__":
    main()
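
# The handlers can also be called directly, e.g. with one of the bundled example
# assets: inference("image-text", text_list="A dog|A car|A bird",
# image="assets/dog_image.jpg") returns a {label: score} dict that the "label"
# output (gr.Label) renders as ranked confidences.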