Spaces:
Running
Running
| import gradio as gr | |
| import weaviate | |
| from weaviate.auth import Auth | |
| from sentence_transformers import SentenceTransformer | |
| from weaviate.classes.query import MetadataQuery | |
| from weaviate.collections.classes.filters import Filter | |
| from typing import List, Dict, Any | |
| import os | |
| from dotenv import load_dotenv | |
| import pandas as pd | |
| import re | |
| from functools import lru_cache | |
# Populate the process environment via python-dotenv before reading settings.
load_dotenv()

# Connection settings for the Weaviate cluster and the collection to query.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")

# Fail fast at import time when any of the settings is unset or empty.
if not (WEAVIATE_URL and WEAVIATE_API_KEY and COLLECTION_NAME):
    raise ValueError(
        "Missing required environment variables. Please ensure the following are set:\n"
        "WEAVIATE_URL\n"
        "WEAVIATE_API_KEY\n"
        "COLLECTION_NAME"
    )

# Sentence-embedding model used to encode search queries; loaded once at import.
model = SentenceTransformer("sentence-transformers/LaBSE")
# Maps the English book names offered in the UI to the abbreviations that are
# matched against the ``book`` property when filtering search results.
# Insertion order is the order in which the books are listed in the dropdown.
VULGATE_BOOKS = {
    # Old Testament
    "Genesis": "Gn",
    "Exodus": "Ex",
    "Leviticus": "Lv",
    "Numbers": "Nm",
    "Deuteronomy": "Dt",
    "Joshua": "Jos",
    "Judges": "Jdc",
    "Ruth": "Rt",
    "1 Samuel": "1Rg",
    "2 Samuel": "2Rg",
    "1 Kings": "3Rg",
    "2 Kings": "4Rg",
    "1 Chronicles": "1Par",
    "2 Chronicles": "2Par",
    "Ezra": "Esr",
    "Nehemiah": "Neh",
    "Tobit": "Tob",
    "Judith": "Jdt",
    "Esther": "Est",
    "1 Maccabees": "1Mcc",
    "2 Maccabees": "2Mcc",
    "Job": "Job",
    "Psalms": "Ps",
    "Proverbs": "Pr",
    "Ecclesiastes": "Ecl",
    "Song of Solomon": "Ct",
    "Wisdom": "Sap",
    "Sirach": "Sir",
    "Isaiah": "Is",
    "Jeremiah": "Jr",
    "Lamentations": "Lam",
    "Baruch": "Bar",
    "Ezekiel": "Ez",
    "Daniel": "Dn",
    "Hosea": "Os",
    "Joel": "Joel",
    "Amos": "Am",
    "Obadiah": "Abd",
    "Jonah": "Jon",
    "Micah": "Mch",
    "Nahum": "Nah",
    "Habakkuk": "Hab",
    "Zephaniah": "Soph",
    "Haggai": "Agg",
    "Zechariah": "Zach",
    "Malachi": "Mal",
    # New Testament
    "Matthew": "Mt",
    "Mark": "Mc",
    "Luke": "Lc",
    "John": "Jo",
    "Acts": "Act",
    "Romans": "Rom",
    "1 Corinthians": "1Cor",
    "2 Corinthians": "2Cor",
    "Galatians": "Gal",
    "Ephesians": "Eph",
    "Philippians": "Phlp",
    "Colossians": "Col",
    "1 Thessalonians": "1Thes",
    "2 Thessalonians": "2Thes",
    "1 Timothy": "1Tim",
    "2 Timothy": "2Tim",
    "Titus": "Tit",
    "Philemon": "Phlm",
    "Hebrews": "Hbr",
    "James": "Jac",
    "1 Peter": "1Ptr",
    "2 Peter": "2Ptr",
    "1 John": "1Jo",
    "2 John": "2Jo",
    "3 John": "3Jo",
    "Jude": "Jud",
    "Revelation": "Apc",
}
def load_vulgate_csv():
    """Read ``data/clem_vulgate.csv`` and return it as a pandas DataFrame.

    The file is expected to carry the columns ``book``, ``chapter``,
    ``verse`` and ``text``; the columns are not validated here.
    """
    return pd.read_csv("data/clem_vulgate.csv")
def highlight_matching_words(text: str, query: str) -> str:
    """Return *text* as an HTML fragment with the words of *query* emphasised.

    A word of *text* that equals a query word (case-insensitively) is wrapped
    in ``<b>``. A purely alphabetic word that merely contains a query word
    gets the contained part wrapped in ``<em>``. Everything else is
    HTML-escaped, so the result can be embedded in markup as-is.

    Args:
        text: Plain verse text to decorate.
        query: The user's search query; its ``\\w+`` words are the terms.

    Returns:
        The decorated, HTML-safe string. When *query* contains no words the
        escaped *text* is returned without any highlighting.
    """
    from html import escape  # function-scope import keeps the block self-contained

    query_words = set(re.findall(r'\b\w+\b', query.lower()))
    if not query_words:
        return escape(text, quote=False)

    # Regex alternation picks the first alternative that matches, and set
    # iteration order varies between runs. Sorting longest-first makes the
    # result deterministic and prefers "deus" over "de" inside "deusque".
    alternatives = sorted(query_words, key=lambda w: (-len(w), w))
    partial_pattern = re.compile(
        '|'.join(re.escape(w) for w in alternatives), re.IGNORECASE
    )

    def emphasise(match):
        return f'<em>{match.group(0)}</em>'

    pieces = []
    # Alternating runs of word / non-word characters cover the whole input,
    # so joining the pieces reproduces the text with markup added.
    for token in re.findall(r'\w+|\W+', text):
        token_lc = token.lower()
        if token_lc in query_words:
            pieces.append(f'<b>{token}</b>')
        elif token.isalpha() and any(w in token_lc for w in query_words):
            pieces.append(partial_pattern.sub(emphasise, token))
        else:
            # Only non-word runs can hold markup characters; escaping the
            # remaining word tokens is a no-op.
            pieces.append(escape(token, quote=False))
    return ''.join(pieces)
def find_similar(query: str, books: List[str], limit: int = 50, search_method: str = "vector") -> List[Dict[str, Any]]:
    """Query the Weaviate collection for verses matching *query*.

    Args:
        query: Free-text search query.
        books: English book names (keys of ``VULGATE_BOOKS``) to restrict the
            search to; an empty or ``None`` value searches every book.
        limit: Maximum number of verses to return.
        search_method: ``"vector"`` for a nearest-vector search, ``"bm25"``
            for keyword search; any other value runs a hybrid search.

    Returns:
        One dict per hit with the keys ``Reference``, ``Book``, ``Chapter``,
        ``Verse``, ``Text`` (HTML with highlights), ``RawText`` and
        ``Similarity``. For vector search ``Similarity`` is ``1 - distance``;
        for bm25 and hybrid it is the score reported by Weaviate, which is on
        a different scale. On any failure a single-element list
        ``[{"Error": message}]`` is returned instead of raising.
    """
    try:
        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=WEAVIATE_URL,
            auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
        )
        try:
            vulgate = client.collections.get(COLLECTION_NAME)

            filter_condition = None
            if books:
                selected_books = [VULGATE_BOOKS[book] for book in books]
                filter_condition = Filter.by_property("book").contains_any(selected_books)

            # Defensive: UI widgets may hand over a float such as 20.0.
            limit = int(limit)

            if search_method == "bm25":
                # Keyword search needs no embedding. The score must be
                # requested explicitly, otherwise metadata.score stays None
                # and every row would fall back to the default similarity.
                response = vulgate.query.bm25(
                    query=query,
                    limit=limit,
                    return_metadata=MetadataQuery(score=True),
                    filters=filter_condition,
                )
            else:
                # Vector and hybrid search both need the query embedding.
                query_vector = model.encode([query])[0]
                if search_method == "vector":
                    response = vulgate.query.near_vector(
                        near_vector=query_vector,
                        limit=limit,
                        return_metadata=MetadataQuery(distance=True),
                        filters=filter_condition,
                    )
                else:  # hybrid
                    response = vulgate.query.hybrid(
                        query=query,
                        vector=query_vector,
                        limit=limit,
                        return_metadata=MetadataQuery(score=True),
                        filters=filter_condition,
                    )

            results = []
            for obj in response.objects:
                props = obj.properties
                distance = getattr(obj.metadata, 'distance', None)
                score = getattr(obj.metadata, 'score', None)
                if distance is not None:
                    similarity = 1 - distance
                elif score is not None:
                    similarity = score
                else:
                    similarity = 1.0  # no ranking metadata available
                results.append({
                    "Reference": f"{props['book']} {props['chapter']}:{props['verse']}",
                    "Book": props["book"],
                    "Chapter": props["chapter"],
                    "Verse": props["verse"],
                    "Text": highlight_matching_words(props["text"], query),
                    "RawText": props["text"],
                    "Similarity": round(similarity, 3),
                })
            return results
        finally:
            # Always release the connection, even when the query fails.
            client.close()
    except Exception as e:
        # UI boundary: report the failure as a result row instead of raising.
        return [{"Error": str(e)}]
def format_results_html(results: List[Dict[str, Any]]) -> str:
    """Render search results as an HTML table.

    Args:
        results: Rows with the keys ``Reference``, ``Text``, ``Similarity``,
            ``Book``, ``Chapter`` and ``Verse``, or a single-element list
            ``[{"Error": message}]`` describing a failure.

    Returns:
        A placeholder ``<div>`` when *results* is empty, a red error ``<div>``
        when the first row carries an ``Error`` key, otherwise a ``<table>``
        with one row per result. ``Text`` is inserted verbatim because it
        already contains highlight markup; every other value is HTML-escaped.
    """
    from html import escape  # function-scope import keeps the block self-contained

    if not results:
        return "<div>No results found.</div>"
    if "Error" in results[0]:
        # Exception text often contains "<...>" (e.g. object reprs), which a
        # browser would swallow as an unknown tag unless it is escaped.
        message = escape(str(results[0]["Error"]), quote=False)
        return f'<div style="color:red">Error: {message}</div>'

    def cell(value: Any) -> str:
        return f'<td>{escape(str(value), quote=False)}</td>'

    parts = [
        '<table border="1">',
        '<thead><tr>'
        '<th>Reference</th><th>Text</th><th>Similarity</th><th>Book</th><th>Chapter</th><th>Verse</th>'
        '</tr></thead><tbody>',
    ]
    for r in results:
        parts.append(
            '<tr>'
            + cell(r["Reference"])
            + f'<td>{r["Text"]}</td>'  # already HTML; must not be escaped again
            + cell(r["Similarity"])
            + cell(r["Book"])
            + cell(r["Chapter"])
            + cell(r["Verse"])
            + '</tr>'
        )
    parts.append('</tbody></table>')
    return ''.join(parts)
def search(query: str, books: List[str], limit: int, search_method: str) -> str:
    """Handle a search request from the UI and return the results as HTML.

    Args:
        query: Text typed by the user; ``None`` is treated like an empty query.
        books: Selected English book names, possibly empty.
        limit: Maximum number of results to show.
        search_method: ``"vector"``, ``"bm25"`` or ``"hybrid"``.

    Returns:
        A prompt asking for a query when *query* is missing or blank,
        otherwise the output of ``format_results_html`` for the hits returned
        by ``find_similar``.
    """
    # Guard against a missing value as well as whitespace-only input, so that
    # a null query does not fail on ``.strip()``.
    if not query or not query.strip():
        return "<div>Please enter a search query.</div>"
    return format_results_html(find_similar(query, books, limit, search_method))
# Gradio UI: a query box, optional book filter, search settings and a results pane.
with gr.Blocks(title="Latin Vulgate Verse Similarity Search", theme=gr.themes.Soft()) as demo:
    # Page header and short usage note.
    gr.Markdown(
        "\n"
        "# Latin Vulgate Verse Similarity Search\n"
        "Search for similar verses in the Latin Vulgate Bible using semantic similarity.\n"
        "<br>Words matching your query will be highlighted (exact matches and partial matches).\n"
    )

    # Free-text query input.
    with gr.Row():
        query = gr.Textbox(label="Search Query", placeholder="Enter your search query...", lines=2, scale=3)

    # Optional restriction to one or more books.
    with gr.Row():
        with gr.Column(scale=2):
            book_select = gr.Dropdown(
                choices=list(VULGATE_BOOKS.keys()),
                label="Select Books (Optional)",
                multiselect=True,
            )

    # Search strategy and result count, side by side.
    with gr.Row():
        with gr.Column(scale=1):
            search_method = gr.Radio(choices=["vector", "bm25", "hybrid"], label="Search Method", value="vector")
        with gr.Column(scale=1):
            limit = gr.Slider(minimum=1, maximum=50, value=20, step=1, label="Number of results")

    with gr.Row():
        search_btn = gr.Button("Search", variant="primary")

    # Receives the HTML produced by search().
    output = gr.HTML(label="Results")

    # The button click is the one handler exposed through the API, as "predict".
    search_btn.click(
        fn=search,
        inputs=[query, book_select, limit, search_method],
        outputs=output,
        api_name="predict",
    )
    # Submitting the textbox runs the same search; it is kept out of the API
    # to avoid conflicts with the button's endpoint.
    query.submit(
        fn=search,
        inputs=[query, book_select, limit, search_method],
        outputs=output,
        api_name=False,
    )
# Start the app only when the file is run as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch(share=False, show_api=True)