import subprocess
import sys

print("--- STARTING INSTALLATION ---")
try:
    # 1. Install llama-cpp-python
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-U", "llama-cpp-python==0.3.16"],
        check=True,
    )

    # 2. Install wikipedia (second command - forces install)
    print("--- INSTALLING WIKIPEDIA ---")
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "wikipedia"],
        check=True,
    )
    print("--- WIKIPEDIA INSTALLED ---")
except Exception as e:
    print(f"--- INSTALLATION FAILED: {e} ---")

import gradio as gr
import wikipedia
from agent import respond, generate_chat, add_history
from llama_cpp import Llama

# ---------------- CONFIG ----------------
BASE_REPO_ID = "unsloth/Llama-3.2-3B-Instruct-GGUF"
BASE_FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

FT_REPO_ID = "JoarP/Llama-3.2-3B-FineTome5K-gguf"
FT_FILENAME = "v1"

FT_REPO_ID_2 = "JoarP/Llama-3.2-3B-Finetuning"
FT_FILENAME_2 = "FuncCall-Synthetic-Small"

N_CTX = 2048
N_THREADS = 2

# ------------- LOAD MODELS ON CPU --------------
print(f"Loading model: {FT_REPO_ID}/{FT_FILENAME}")
try:
    llm_ft = Llama.from_pretrained(
        repo_id=FT_REPO_ID,
        filename=FT_FILENAME,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
    )
    AVAILABLE_MODELS = {"FineTome FT - Llama 3.2 3B": llm_ft}

    print("Loading base model...")
    llm_base = Llama.from_pretrained(
        repo_id=BASE_REPO_ID,
        filename=BASE_FILENAME,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
    )
    AVAILABLE_MODELS["Llama 3.2 3B"] = llm_base

    print(f"Loading model: {FT_REPO_ID_2}/{FT_FILENAME_2}")
    llm_ft_2 = Llama.from_pretrained(
        repo_id=FT_REPO_ID_2,
        filename=FT_FILENAME_2,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
    )
    AVAILABLE_MODELS["FuncCall FT - Llama 3.2 3B"] = llm_ft_2
except Exception as e:
    print(f"Error loading model: {e}")
    raise e

# --------------- TO RUN LOCALLY -----------------
# llm = Llama(
#     model_path="works2.gguf",
#     n_ctx=N_CTX,
#     n_threads=None,
# )
# AVAILABLE_MODELS = {
#     "Finetuned Llama 3.2 3B": llm,
# }

# ------------- FAST RESPONSE WITHOUT AGENT --------------
def respond_fast(user_message, history, model_choice):
    """
    Fast path: no tools, no agent. Just a single LLM call with the
    given system message and chat history.
    """
    # Pick model from dropdown, falling back to the first available model
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    history = add_history(
        user_message,
        history,
        "You are a helpful assistant. Just chat with the user.",
    )

    # Single streaming generation
    stream = generate_chat(llm, history, max_tokens=256, temperature=0.2, top_p=0.95)
    for out in stream:
        yield out

# ------------- TOOLS DEFINITIONS --------------
from weather import get_current_weather, get_current_temperature

def multiply(a: int, b: int) -> int:
    """Multiplies two integers. Args: a, b."""
    return a * b

def search_wikipedia(query: str) -> str:
    """
    Searches Wikipedia for a topic and returns a summary.
    Args: query (the topic to search for).
    """
    try:
        # Limit to 2 sentences to keep the LLM context small
        return wikipedia.summary(query, sentences=2)
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Search ambiguous. Options: {e.options[:5]}"
    except wikipedia.exceptions.PageError:
        return "No Wikipedia page found for that query."
    except Exception as e:
        return f"Error searching Wikipedia: {str(e)}"

# ------------- AGENT CONFIGURATIONS --------------
AGENTS = {
    "Weather": {
        "system_message": "You are a helpful weather assistant",
        "tools": [get_current_weather, get_current_temperature],
    },
    "Math": {
        "system_message": "You are a helpful math assistant",
        "tools": [multiply],
    },
    "Researcher": {
        "system_message": (
            "You are a research assistant. "
            "If the user asks a factual question, use the search_wikipedia tool."
        ),
        "tools": [search_wikipedia],
    },
}

# ------------- WRAPPER FUNCTION FOR AGENTS ----------------
def app_respond(message, history, model_choice, agent_choice):
    """Pass the selected agent to our agentic framework."""
    llm = AVAILABLE_MODELS.get(model_choice)
    if llm is None:
        llm = next(iter(AVAILABLE_MODELS.values()))

    agent_config = AGENTS.get(agent_choice)

    for chunk in respond(
        message,
        history,
        system_message=agent_config["system_message"],
        llm=llm,
        tools=agent_config["tools"],
    ):
        yield chunk

# ------------- GRADIO UI ----------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Finetuned Llama 3.2 3B (CPU, GGUF) in an Agentic Framework\n"
        "Switch between a general assistant, a live weather assistant, and a math assistant."
    )

    with gr.Tabs():
        # -------- TAB 1: GENERAL LLM ASSISTANT --------
        with gr.Tab("💬 General Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### General Assistant\n"
                        "Chat with the base or fine-tuned model. Use this mode for any kind of question.\n"
                        "The general assistant has the fastest response time."
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )
                with gr.Column(scale=3, elem_id="general-chat"):
                    general_chatbot = gr.ChatInterface(
                        fn=respond_fast,
                        additional_inputs=[
                            model_dropdown,
                        ],
                    )

        # -------- TAB 2: LIVE WEATHER ASSISTANT --------
        with gr.Tab("☀️ LIVE Weather Assistant"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        "### Live Weather Assistant\n"
                        "Fetches up-to-date weather data"
                    )
                    model_dropdown = gr.Dropdown(
                        label="Model",
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        interactive=True,
                    )
                with gr.Column(scale=3, elem_id="weather-chat"):
                    # 🌤️ ASSISTANT HEADER (name + avatar + tagline)
                    gr.HTML(
                        """