Spaces:

lightonai
/

vision_pipeline

Running on Zero

App Files Files Community

Antoine Chaffin committed on Sep 24, 2024

Commit

349b5c2

1 Parent(s): d7e0b8c

Initial commit

Browse files

Files changed (4) hide show

app.py +106 -0
model.py +118 -0
requirements.txt +9 -0
voyager_index.py +221 -0

app.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import uuid
+import gradio as gr
+import torch
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from voyager_index import Voyager
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "cpu"
+# Initialize the model and processor
+model = (
+    Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16
+    )
+    .to(device)
+    .eval()
+)
+processor = AutoProcessor.from_pretrained(
+    "Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True
+)
+def create_index(session_id):
+    return Voyager(embedding_size=1536, override=True, index_name=f"{session_id}")
+def add_to_index(files, index):
+    index.add_documents([file.name for file in files], batch_size=1)
+    return f"Added {len(files)} files to the index."
+def query_index(query, index):
+    res = index(query, k=1)
+    retrieved_image = res["documents"][0][0]["image"]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": retrieved_image,
+                },
+                {"type": "text", "text": query},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to(device)
+    generated_ids = model.generate(**inputs, max_new_tokens=200)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )
+    return output_text[0], retrieved_image
+# Define the Gradio interface
+with gr.Blocks() as demo:
+    session_id = gr.State(lambda: str(uuid.uuid4()))
+    index = gr.State(lambda: create_index(session_id.value))
+    gr.Markdown("# Full vision pipeline demo")
+    with gr.Tab("Add to Index"):
+        file_input = gr.File(file_count="multiple", label="Upload Files")
+        add_button = gr.Button("Add to Index")
+        add_output = gr.Textbox(label="Result")
+        add_button.click(add_to_index, inputs=[file_input, index], outputs=add_output)
+    with gr.Tab("Query Index"):
+        query_input = gr.Textbox(label="Enter your query")
+        query_button = gr.Button("Submit Query")
+        with gr.Row():
+            query_output = gr.Textbox(label="Answer")
+            image_output = gr.Image(label="Retrieved Image")
+        query_button.click(
+            query_index,
+            inputs=[query_input, index],
+            outputs=[query_output, image_output],
+        )
+# Launch the interface
+demo.launch()

model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import torch
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# device = "cpu"
+min_pixels = 1 * 28 * 28
+max_pixels = 256 * 28 * 28  # 2560 * 28 * 28
+processor = AutoProcessor.from_pretrained(
+    "MrLight/dse-qwen2-2b-mrl-v1", min_pixels=min_pixels, max_pixels=max_pixels
+)
+model = (
+    Qwen2VLForConditionalGeneration.from_pretrained(
+        "MrLight/dse-qwen2-2b-mrl-v1",
+        # attn_implementation="eager",
+        attn_implementation="flash_attention_2"
+        if device == "cuda"
+        else "eager",  # flash_attn is required but is a pain to install on spaces
+        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+    )
+    .to(device)
+    .eval()
+)
+processor.tokenizer.padding_side = "left"
+model.padding_side = "left"
+def get_embedding(last_hidden_state: torch.Tensor, dimension: int):
+    reps = last_hidden_state[:, -1]
+    reps = torch.nn.functional.normalize(reps[:, :dimension], p=2, dim=-1)
+    return reps.to(torch.float32).cpu().numpy()
+def encode_queries(queries: list):
+    if isinstance(queries, str):
+        queries = [queries]
+    query_messages = []
+    for query in queries:
+        message = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": Image.new("RGB", (28, 28)),
+                        "resized_height": 1,
+                        "resized_width": 1,
+                    },  # need a dummy image here for an easier process.
+                    {"type": "text", "text": f"Query: {query}"},
+                ],
+            }
+        ]
+        query_messages.append(message)
+    query_texts = [
+        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
+        + "<|endoftext|>"
+        for msg in query_messages
+    ]
+    query_image_inputs, query_video_inputs = process_vision_info(query_messages)
+    query_inputs = processor(
+        text=query_texts,
+        images=query_image_inputs,
+        videos=query_video_inputs,
+        padding="longest",
+        return_tensors="pt",
+    ).to(device)
+    query_inputs = model.prepare_inputs_for_generation(**query_inputs, use_cache=False)
+    with torch.no_grad():
+        output = model(**query_inputs, return_dict=True, output_hidden_states=True)
+        query_embeddings = get_embedding(
+            output.hidden_states[-1], 1536
+        )  # adjust dimensionality for efficiency trade-off, e.g. 512
+    return query_embeddings
+def encode_images(images: list):
+    if isinstance(images, Image.Image):
+        images = [images]
+    doc_messages = []
+    for image in images:
+        message = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image,
+                    },  #'resized_height':680 , 'resized_width':680} # adjust the image size for efficiency trade-off
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            }
+        ]
+        doc_messages.append(message)
+    doc_texts = [
+        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
+        + "<|endoftext|>"
+        for msg in doc_messages
+    ]
+    doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
+    doc_inputs = processor(
+        text=doc_texts,
+        images=doc_image_inputs,
+        videos=doc_video_inputs,
+        padding="longest",
+        return_tensors="pt",
+    ).to(device)
+    doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, use_cache=False)
+    output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
+    with torch.no_grad():
+        output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
+    doc_embeddings = get_embedding(
+        output.hidden_states[-1], 1536
+    )  # adjust dimensionality for efficiency trade-off e.g. 512
+    return doc_embeddings

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch
+torchvision
+git+https://github.com/huggingface/transformers.git@refs/pull/33654/head#egg=transformers #git+https://github.com/huggingface/transformers #transformers
+qwen-vl-utils
+gradio
+pypdfium2
+# flash_attn # https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch1.12cxx11abiFALSE-cp310-cp310-linux_x86_64.whl #flash_attn
+sqlitedict
+voyager

voyager_index.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import os
+from collections.abc import Iterator
+import numpy as np
+import pypdfium2 as pdfium
+import torch
+import tqdm
+from PIL import Image
+from sqlitedict import SqliteDict
+from voyager import Index, Space
+from model import encode_images, encode_queries
+def iter_batch(
+    X: list[str], batch_size: int, tqdm_bar: bool = True, desc: str = ""
+) -> list:
+    """Iterate over a list of elements by batch."""
+    batchs = [X[pos : pos + batch_size] for pos in range(0, len(X), batch_size)]
+    if tqdm_bar:
+        for batch in tqdm.tqdm(
+            iterable=batchs,
+            position=0,
+            total=1 + len(X) // batch_size,
+            desc=desc,
+        ):
+            yield batch
+    else:
+        yield from batchs
+class Voyager:
+    """Voyager index. The Voyager index is a fast and efficient index for approximate nearest neighbor search.
+    Parameters
+    ----------
+    name
+        The name of the collection.
+    override
+        Whether to override the collection if it already exists.
+    embedding_size
+        The number of dimensions of the embeddings.
+    M
+        The number of subquantizers.
+    ef_construction
+        The number of candidates to evaluate during the construction of the index.
+    ef_search
+        The number of candidates to evaluate during the search.
+    """
+    def __init__(
+        self,
+        index_folder: str = "indexes",
+        index_name: str = "base_collection",
+        override: bool = False,
+        embedding_size: int = 128,
+        M: int = 64,
+        ef_construction: int = 200,
+        ef_search: int = 200,
+    ) -> None:
+        self.ef_search = ef_search
+        if not os.path.exists(path=index_folder):
+            os.makedirs(name=index_folder)
+        self.index_path = os.path.join(index_folder, f"{index_name}.voyager")
+        self.page_ids_to_data_path = os.path.join(
+            index_folder, f"{index_name}_page_ids_to_data.sqlite"
+        )
+        self.index = self._create_collection(
+            index_path=self.index_path,
+            embedding_size=embedding_size,
+            M=M,
+            ef_constructions=ef_construction,
+            override=override,
+        )
+    def _load_page_ids_to_data(self) -> SqliteDict:
+        """Load the SQLite database that maps document IDs to images."""
+        return SqliteDict(self.page_ids_to_data_path, outer_stack=False)
+    def _create_collection(
+        self,
+        index_path: str,
+        embedding_size: int,
+        M: int,
+        ef_constructions: int,
+        override: bool,
+    ) -> None:
+        """Create a new Voyager collection.
+        Parameters
+        ----------
+        index_path
+            The path to the index.
+        embedding_size
+            The size of the embeddings.
+        M
+            The number of subquantizers.
+        ef_constructions
+            The number of candidates to evaluate during the construction of the index.
+        override
+            Whether to override the collection if it already exists.
+        """
+        if os.path.exists(path=index_path) and not override:
+            return Index.load(index_path)
+        if os.path.exists(path=index_path):
+            os.remove(index_path)
+        # Create the Voyager index
+        index = Index(
+            Space.Cosine,
+            num_dimensions=embedding_size,
+            M=M,
+            ef_construction=ef_constructions,
+        )
+        index.save(index_path)
+        if override and os.path.exists(path=self.page_ids_to_data_path):
+            os.remove(path=self.page_ids_to_data_path)
+        # Create the SQLite databases
+        page_ids_to_data = self._load_page_ids_to_data()
+        page_ids_to_data.close()
+        return index
+    def add_documents(
+        self,
+        paths: str | list[str],
+        batch_size: int = 1,
+    ) -> None:
+        """Add documents to the index. Note that batch_size means the number of pages to encode at once, not documents."""
+        if isinstance(paths, str):
+            paths = [paths]
+        page_ids_to_data = self._load_page_ids_to_data()
+        images = []
+        num_pages = []
+        for path in paths:
+            if path.lower().endswith(".pdf"):
+                pdf = pdfium.PdfDocument(path)
+                n_pages = len(pdf)
+                num_pages.append(n_pages)
+                for page_number in range(n_pages):
+                    page = pdf.get_page(page_number)
+                    pil_image = page.render(
+                        scale=1,
+                        rotation=0,
+                    )
+                    pil_image = pil_image.to_pil()
+                    images.append(pil_image)
+                pdf.close()
+            else:
+                pil_image = Image.open(path)
+                images.append(pil_image)
+                num_pages.append(1)
+        embeddings = []
+        for batch in iter_batch(
+            X=images, batch_size=batch_size, desc=f"Encoding pages (bs={batch_size})"
+        ):
+            embeddings.extend(encode_images(batch))
+        embeddings_ids = self.index.add_items(embeddings)
+        current_index = 0
+        for i, path in enumerate(paths):
+            for page_number in range(num_pages[i]):
+                page_ids_to_data[embeddings_ids[current_index]] = {
+                    "path": path,
+                    "image": images[current_index],
+                    "page_number": page_number,
+                }
+                current_index += 1
+        page_ids_to_data.commit()
+        self.index.save(self.index_path)
+        return self
+    def __call__(
+        self,
+        queries: np.ndarray | torch.Tensor,
+        k: int = 10,
+    ) -> dict:
+        """Query the index for the nearest neighbors of the queries embeddings.
+        Parameters
+        ----------
+        queries_embeddings
+            The queries embeddings.
+        k
+            The number of nearest neighbors to return.
+        """
+        queries_embeddings = encode_queries(queries)
+        page_ids_to_data = self._load_page_ids_to_data()
+        k = min(k, len(page_ids_to_data))
+        n_queries = len(queries_embeddings)
+        indices, distances = self.index.query(
+            queries_embeddings, k, query_ef=self.ef_search
+        )
+        if len(indices) == 0:
+            raise ValueError("Index is empty, add documents before querying.")
+        documents = [
+            [page_ids_to_data[str(indice)] for indice in query_indices]
+            for query_indices in indices
+        ]
+        page_ids_to_data.close()
+        return {
+            "documents": documents,
+            "distances": distances.reshape(n_queries, -1, k),
+        }