tecuts committed
Commit 846f4fb · verified · 1 Parent(s): d8153f4

Update app.py

Files changed (1):
  1. app.py +284 -298

app.py CHANGED
--- app.py (old version; removed lines are marked "-")

@@ -1,13 +1,12 @@
  import os
  import json
  import requests
- import httpx
  from datetime import datetime
  from typing import List, Dict, Optional
- from fastapi import FastAPI, Request, HTTPException, Depends, status
- from fastapi.responses import StreamingResponse
  from fastapi.middleware.cors import CORSMiddleware
- from openai import AsyncOpenAI
  from openai import OpenAI
  import logging

@@ -48,150 +47,101 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
  GOOGLE_CX = os.getenv("GOOGLE_CX")
  LLM_API_KEY = os.getenv("LLM_API_KEY")
  LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://api-15i2e8ze256bvfn6.aistudio-app.com/v1")
- MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen3-30B-A3B-GGUF")

- # --- Enhanced System Prompt ---
  SYSTEM_PROMPT_WITH_SEARCH = """You are an intelligent AI assistant with access to real-time web search capabilities.

- IMPORTANT: When search tools are available to you, you should USE them for any query that could benefit from current, recent, or specific factual information, even if you have some knowledge about the topic from your training data.
-
- **When to use search (be proactive about using search when available):**
- - Any mention of recent events, current affairs, or "latest" information
- - Specific facts that could have changed since your training
- - Statistics, prices, scores, or numerical data
- - News, announcements, or current status of anything
- - When the user explicitly asks for current information
- - Any factual query where fresh information would be valuable

  **Response Guidelines:**
- 1. Use search tools when they're available and relevant to the query
- 2. Synthesize information from multiple sources when possible
  3. Clearly indicate when information comes from search results
  4. Provide comprehensive, well-structured answers
  5. Cite sources appropriately
- 6. If search results are contradictory, mention the discrepancy
-
- **Current Context**: Today's date is {current_date}. Prioritize recent information when available.

- Remember: When in doubt about whether to search, lean towards using the search tool for more accurate and current information."""

- SYSTEM_PROMPT_NO_SEARCH = """You are an intelligent AI assistant. Provide helpful, accurate, and comprehensive responses based on your training data.

- When you don't have current information about recent events or changing data, acknowledge this limitation and suggest that the user might want to search for the most up-to-date information.

- **Current Context**: Today's date is {current_date}, but your knowledge has a cutoff date and may not include the most recent information."""
-
- # --- Enhanced Web Search Tool Implementation ---
- def google_search_tool(queries: List[str], num_results: int = 5) -> List[Dict]:
      """
-     Enhanced Google Custom Search with better error handling and result formatting
      """
-     if not GOOGLE_API_KEY or not GOOGLE_CX:
-         logger.error("GOOGLE_API_KEY or GOOGLE_CX environment variables not set.")
-         return []
-
-     if not queries or not queries[0].strip():
-         logger.warning("Empty search query provided")
          return []

-     query = queries[0].strip()
-     logger.info(f"Executing Google Custom Search for: '{query}'")

      search_url = "https://www.googleapis.com/customsearch/v1"
      params = {
          "key": GOOGLE_API_KEY,
          "cx": GOOGLE_CX,
-         "q": query,
-         "num": min(num_results, 10),  # Google API max is 10
-         "dateRestrict": "m6"  # Prioritize results from last 6 months for freshness
      }

      try:
-         response = requests.get(search_url, params=params, timeout=15)
          response.raise_for_status()
          search_results = response.json()

          if "items" not in search_results:
-             logger.warning(f"No search results found for query: '{query}'")
              return []

-         # Enhanced result parsing with better data validation
          parsed_results = []
-         for item in search_results.get("items", []):
              title = item.get("title", "").strip()
              url = item.get("link", "").strip()
              snippet = item.get("snippet", "").strip()

-             # Skip results with missing essential information
-             if not title or not url or not snippet:
-                 continue
-
-             # Extract publication date if available
-             pub_date = None
-             if "pagemap" in item and "metatags" in item["pagemap"]:
-                 for meta in item["pagemap"]["metatags"]:
-                     if "article:published_time" in meta:
-                         pub_date = meta["article:published_time"]
-                         break
-
-             parsed_results.append({
-                 "source_title": title,
-                 "url": url,
-                 "snippet": snippet,
-                 "published_date": pub_date,
-                 "domain": url.split('/')[2] if '/' in url else url
-             })

-         logger.info(f"Successfully parsed {len(parsed_results)} search results")
          return parsed_results

-     except requests.exceptions.Timeout:
-         logger.error("Google search request timed out")
-         return []
-     except requests.exceptions.RequestException as e:
-         logger.error(f"Error during Google search request: {e}")
-         return []
      except Exception as e:
-         logger.error(f"Unexpected error in google_search_tool: {e}")
          return []

- def format_search_results_for_llm(search_results: List[Dict]) -> str:
-     """
-     Format search results with enhanced context for better LLM understanding
-     """
      if not search_results:
-         return "No relevant search results were found for this query."
-
-     current_date = datetime.now().strftime("%Y-%m-%d")
-     formatted_results = [f"Search Results (Retrieved on {current_date}):\n"]

      for i, result in enumerate(search_results, 1):
-         formatted_result = f"\n--- Result {i} ---"
-         formatted_result += f"\nTitle: {result['source_title']}"
-         formatted_result += f"\nSource: {result['domain']}"
-         formatted_result += f"\nURL: {result['url']}"
-
-         if result.get('published_date'):
-             formatted_result += f"\nPublished: {result['published_date']}"
-
-         formatted_result += f"\nContent: {result['snippet']}"
-         formatted_results.append(formatted_result)
-
-     formatted_results.append(f"\n--- End of Search Results ---\n")
-     formatted_results.append("Please synthesize this information to provide a comprehensive answer to the user's question. If the search results contain conflicting information, please note the discrepancy. Always cite your sources when using information from the search results.")

-     return "\n".join(formatted_results)

  # --- FastAPI Application Setup ---
- app = FastAPI(title="AI Chatbot with Enhanced Search", version="2.0.0")

  app.add_middleware(
      CORSMiddleware,
      allow_origins=[
          "https://chrunos.com",
          "https://www.chrunos.com",
-         "http://localhost:3000",  # For local development
-         "http://localhost:8000",  # For local development
      ],
      allow_credentials=True,
      allow_methods=["GET", "POST", "OPTIONS"],
@@ -203,22 +153,22 @@ if not LLM_API_KEY or not LLM_BASE_URL:
      logger.error("LLM_API_KEY or LLM_BASE_URL not configured")
      client = None
  else:
-     client = AsyncOpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL)
      logger.info("OpenAI client initialized successfully")

- # --- Enhanced Tool Definition ---
  available_tools = [
      {
          "type": "function",
          "function": {
              "name": "google_search",
-             "description": "REQUIRED for current information: Performs a Google search for recent events, current data, latest news, statistics, prices, or any information that changes frequently. Use this tool proactively when the user's query could benefit from up-to-date information, even if you have some relevant knowledge from training data.",
              "parameters": {
                  "type": "object",
                  "properties": {
                      "query": {
                          "type": "string",
-                         "description": "The search query. Be specific and include relevant keywords. For recent events, include time-related terms like 'latest', '2024', 'recent', etc."
                      }
                  },
                  "required": ["query"]
@@ -227,246 +177,282 @@ available_tools = [
      }
  ]

- def should_use_search(message: str) -> bool:
-     """
-     Intelligent decision making for when to enable search based on message content
-     """
-     search_indicators = [
-         "latest", "recent", "current", "now", "today", "this year", "2024", "2025",
-         "news", "update", "what's happening", "status", "price", "stock",
-         "weather", "score", "results", "announcement", "release"
-     ]

-     factual_indicators = [
-         "who is", "what is", "where is", "when did", "how many", "statistics",
-         "data", "information about", "tell me about", "facts about"
-     ]
-
-     message_lower = message.lower()
-
-     # Strong indicators for search
-     if any(indicator in message_lower for indicator in search_indicators):
-         return True
-
-     # Moderate indicators for search (factual queries)
-     if any(indicator in message_lower for indicator in factual_indicators):
-         return True

-     return False

- # --- Enhanced Chatbot Endpoint ---
- @app.post("/chat")
- async def chat_endpoint(request: Request, _: None = Depends(verify_origin)):
      if not client:
          raise HTTPException(status_code=500, detail="LLM client not configured")

      try:
          data = await request.json()
          user_message = data.get("message", "").strip()

-         # Support both 'use_search' and 'user_search' parameter names for flexibility
-         use_search = data.get("use_search")
-         if use_search is None:
-             use_search = data.get("user_search")  # Alternative parameter name

-         # Allow client to specify temperature (with validation)
-         temperature = data.get("temperature", 0.7)  # Default to 0.7
-         if not isinstance(temperature, (int, float)) or temperature < 0 or temperature > 2:
-             logger.warning(f"Invalid temperature value: {temperature}, defaulting to 0.7")
-             temperature = 0.7

-         conversation_history = data.get("history", [])

-         # Debug logging for request parameters
-         logger.info(f"Request parameters - message length: {len(user_message)}, use_search: {use_search}, temperature: {temperature}, history length: {len(conversation_history)}")

          if not user_message:
              raise HTTPException(status_code=400, detail="No message provided")

-         # Auto-decide search usage if not specified
-         '''if use_search is None:
-             use_search = should_use_search(user_message)
-             logger.info(f"Auto-decided search usage: {use_search}")
-         else:
-             logger.info(f"Manual search setting: {use_search}")'''
-
-         # Prepare messages with appropriate system prompt based on search availability
          current_date = datetime.now().strftime("%Y-%m-%d")

-         if use_search:
-             system_content = SYSTEM_PROMPT_WITH_SEARCH.format(current_date=current_date)
-         else:
-             system_content = SYSTEM_PROMPT_NO_SEARCH.format(current_date=current_date)
-
-         system_message = {"role": "system", "content": system_content}
-         messages = [system_message] + conversation_history + [{"role": "user", "content": user_message}]
-
-         llm_kwargs = {
-             "model": MODEL_NAME,
-             "temperature": temperature,  # Use client-specified temperature
-             "messages": messages,
-             "max_tokens": 2000  # Ensure comprehensive responses
-         }

          if use_search:
-             logger.info("Search is ENABLED - tools will be available to the model")
-             llm_kwargs["tools"] = available_tools
-             llm_kwargs["tool_choice"] = "required"  # Consider using "required" for testing
-         else:
-             logger.info("Search is DISABLED - no tools available")
-
-         # First LLM call
-         logger.info(f"Making LLM request with tools: {bool(use_search)}, temperature: {temperature}")
-         llm_response = await client.chat.completions.create(**llm_kwargs)
-         tool_calls = llm_response.choices[0].message.tool_calls
-         source_links = []
-
-         # Debug: Log tool call information
-         if tool_calls:
-             logger.info(f"LLM made {len(tool_calls)} tool calls")
-             for i, call in enumerate(tool_calls):
-                 logger.info(f"Tool call {i+1}: {call.function.name} with args: {call.function.arguments}")
-         else:
-             logger.info("LLM did not make any tool calls")
-             if use_search:
-                 logger.warning("Search was enabled but LLM chose not to use search tools - this might indicate the query doesn't require current information")
-
-         if tool_calls:
-             logger.info(f"Processing {len(tool_calls)} tool calls")
-             tool_outputs = []

-             for tool_call in tool_calls:
-                 if tool_call.function.name == "google_search":
-                     try:
-                         function_args = json.loads(tool_call.function.arguments)
-                         search_query = function_args.get("query", "").strip()
-
-                         if search_query:
-                             logger.info(f"Executing search for: {search_query}")
-                             search_results = google_search_tool([search_query], num_results=5)
-
-                             # Collect source links for response
-                             for result in search_results:
                                  source_links.append({
                                      "title": result["source_title"],
                                      "url": result["url"],
                                      "domain": result["domain"]
                                  })
-
-                             # Format results for LLM
-                             formatted_results = format_search_results_for_llm(search_results)
-                             tool_outputs.append({
-                                 "tool_call_id": tool_call.id,
-                                 "output": formatted_results
-                             })
-                         else:
-                             logger.warning("Empty search query in tool call")
-                             tool_outputs.append({
-                                 "tool_call_id": tool_call.id,
-                                 "output": "Error: Empty search query provided."
-                             })
-
-                     except json.JSONDecodeError as e:
-                         logger.error(f"Failed to parse tool call arguments: {e}")
-                         tool_outputs.append({
-                             "tool_call_id": tool_call.id,
-                             "output": "Error: Failed to parse search parameters."
-                         })
-
-             # Continue conversation with search results
-             messages.append(llm_response.choices[0].message)
-             for output_item in tool_outputs:
-                 messages.append({
-                     "role": "tool",
-                     "tool_call_id": output_item["tool_call_id"],
-                     "content": output_item["output"]
-                 })
-
-         # Final response generation with search context
-         # Enhanced response structure
-         logger.info(f"Chat response generated successfully. Search used: {bool(tool_calls)}, Temperature: {temperature}")
-
-         async def stream_response():
-             if not client:
-                 yield f"data: {json.dumps({'chunk': 'Error: LLM client not configured', 'sources': [], 'search_used': bool(tool_calls), 'temperature': temperature, 'timestamp': datetime.now().isoformat()})}\n\n"
-                 yield "data: [DONE]\n\n"
-                 return
-
-             # Handle the streaming response manually with httpx
-             headers = {
-                 "Content-Type": "application/json",
-                 "Authorization": f"Bearer {LLM_API_KEY}"
-             }
-
-             payload = {
-                 "model": MODEL_NAME,
-                 "messages": messages,
-                 "temperature": temperature,
-                 "stream": True
-             }
-
-             async with httpx.AsyncClient() as http_client:
-                 try:
-                     response = await http_client.post(f"{LLM_BASE_URL}/chat/completions", headers=headers, json=payload)
-                     response.raise_for_status()  # Raise HTTP errors
-                     async for line in response.aiter_lines():
-                         if line.startswith("data: "):
-                             data = line[len("data: "):]
-                             if data.strip() == "[DONE]":
-                                 yield "data: [DONE]\n\n"
-                                 break
-                             try:
-                                 chunk = json.loads(data)
-                                 content = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
-                                 if content:
-                                     yield f"data: {json.dumps({'chunk': content, 'sources': source_links, 'search_used': bool(tool_calls), 'temperature': temperature, 'timestamp': datetime.now().isoformat()})}\n\n"
-                             except json.JSONDecodeError:
-                                 logger.warning(f"Failed to decode JSON chunk: {data}")
-                                 continue
-                 except httpx.HTTPError as e:
-                     logger.error(f"HTTP request failed: {str(e)}")
-                     yield f"data: {json.dumps({'error': 'Failed to generate response', 'details': str(e)})}\n\n"
-                     yield "data: [DONE]\n\n"
-                     return
-                 except Exception as e:
-                     logger.error(f"Unexpected error in streaming response: {str(e)}")
-                     yield f"data: {json.dumps({'error': 'An unexpected error occurred', 'details': str(e)})}\n\n"
-                     yield "data: [DONE]\n\n"
-                     return
-
-             # Send the end-of-stream signal
-             yield "data: [DONE]\n\n"
-
-         return StreamingResponse(stream_response(), media_type="text/event-stream")

-     except HTTPException:
-         raise
-     except json.JSONDecodeError:
-         logger.error("Invalid JSON in request body")
-         raise HTTPException(status_code=400, detail="Invalid JSON in request body")
      except Exception as e:
-         logger.error(f"Unexpected error in /chat endpoint: {e}")
-         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

- # --- Health Check Endpoint ---
  @app.get("/")
  async def root():
      return {
-         "message": "Enhanced AI Chatbot API is running",
-         "version": "2.0.0",
-         "features": ["Google Search Integration", "Intelligent Search Decision", "Enhanced Prompting"],
          "timestamp": datetime.now().isoformat()
      }

- # --- Health Check Endpoint ---
  @app.get("/health")
  async def health_check():
-     health_status = {
          "status": "healthy",
          "timestamp": datetime.now().isoformat(),
          "services": {
              "llm_client": client is not None,
              "google_search": bool(GOOGLE_API_KEY and GOOGLE_CX)
          }
-     }
-     return health_status
 
+++ app.py (new version; added lines are marked "+")

@@ -1,13 +1,12 @@
  import os
  import json
+ import asyncio
  import requests
  from datetime import datetime
  from typing import List, Dict, Optional
+ from fastapi import FastAPI, Request, HTTPException, Depends
  from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse
  from openai import OpenAI
  import logging

@@ -48,150 +47,101 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
  GOOGLE_CX = os.getenv("GOOGLE_CX")
  LLM_API_KEY = os.getenv("LLM_API_KEY")
  LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://api-15i2e8ze256bvfn6.aistudio-app.com/v1")

+ # --- Simplified System Prompts ---
  SYSTEM_PROMPT_WITH_SEARCH = """You are an intelligent AI assistant with access to real-time web search capabilities.

+ When search tools are available, use them for queries that need current, recent, or specific factual information.

  **Response Guidelines:**
+ 1. Use search tools when available and relevant
+ 2. Synthesize information from multiple sources
  3. Clearly indicate when information comes from search results
  4. Provide comprehensive, well-structured answers
  5. Cite sources appropriately

+ Current date: {current_date}"""

+ SYSTEM_PROMPT_NO_SEARCH = """You are an intelligent AI assistant. Provide helpful, accurate, and comprehensive responses based on your training data.

+ Current date: {current_date}"""

+ # --- Optimized Web Search Tool ---
+ async def google_search_tool_async(query: str, num_results: int = 3) -> List[Dict]:
      """
+     Async Google Custom Search - reduced results for faster response
      """
+     if not GOOGLE_API_KEY or not GOOGLE_CX or not query.strip():
          return []

+     logger.info(f"Executing search for: '{query}'")

      search_url = "https://www.googleapis.com/customsearch/v1"
      params = {
          "key": GOOGLE_API_KEY,
          "cx": GOOGLE_CX,
+         "q": query.strip(),
+         "num": min(num_results, 5),  # Reduced for speed
+         "dateRestrict": "m3"  # Last 3 months for freshness
      }

      try:
+         # Run in thread pool to avoid blocking
+         loop = asyncio.get_event_loop()
+         response = await loop.run_in_executor(
+             None,
+             lambda: requests.get(search_url, params=params, timeout=10)
+         )
          response.raise_for_status()
          search_results = response.json()

          if "items" not in search_results:
              return []

          parsed_results = []
+         for item in search_results.get("items", [])[:num_results]:  # Limit results
              title = item.get("title", "").strip()
              url = item.get("link", "").strip()
              snippet = item.get("snippet", "").strip()

+             if title and url and snippet:
+                 parsed_results.append({
+                     "source_title": title,
+                     "url": url,
+                     "snippet": snippet,
+                     "domain": url.split('/')[2] if '/' in url else url
+                 })

+         logger.info(f"Retrieved {len(parsed_results)} search results")
          return parsed_results

      except Exception as e:
+         logger.error(f"Search error: {e}")
          return []

+ def format_search_results_compact(search_results: List[Dict]) -> str:
+     """Compact formatting for faster processing"""
      if not search_results:
+         return "No search results found."

+     formatted = ["Search Results:"]
      for i, result in enumerate(search_results, 1):
+         formatted.append(f"\n{i}. {result['source_title']}")
+         formatted.append(f" Source: {result['domain']}")
+         formatted.append(f" Content: {result['snippet']}")

+     return "\n".join(formatted)
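For reference, a minimal sketch of how these two new helpers could be exercised on their own, assuming `app.py` is importable as `app` and that `GOOGLE_API_KEY` and `GOOGLE_CX` are set in the environment (the helper returns `[]` otherwise):

```python
# Standalone sketch: run the async search helper and print the compact block
# that the endpoints below inject into the follow-up LLM call.
# Assumes app.py is importable as `app` and the Google env vars are set.
import asyncio

from app import google_search_tool_async, format_search_results_compact

async def main() -> None:
    # google_search_tool_async returns [] on missing credentials or any request error.
    results = await google_search_tool_async("fastapi streaming response", num_results=3)
    print(format_search_results_compact(results))

if __name__ == "__main__":
    asyncio.run(main())
```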

  # --- FastAPI Application Setup ---
+ app = FastAPI(title="Streaming AI Chatbot", version="2.1.0")

  app.add_middleware(
      CORSMiddleware,
      allow_origins=[
          "https://chrunos.com",
          "https://www.chrunos.com",
+         "http://localhost:3000",
+         "http://localhost:8000",
      ],
      allow_credentials=True,
      allow_methods=["GET", "POST", "OPTIONS"],

@@ -203,22 +153,22 @@ if not LLM_API_KEY or not LLM_BASE_URL:
      logger.error("LLM_API_KEY or LLM_BASE_URL not configured")
      client = None
  else:
+     client = OpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL)
      logger.info("OpenAI client initialized successfully")

+ # --- Tool Definition ---
  available_tools = [
      {
          "type": "function",
          "function": {
              "name": "google_search",
+             "description": "Search Google for current information, recent events, or specific facts.",
              "parameters": {
                  "type": "object",
                  "properties": {
                      "query": {
                          "type": "string",
+                         "description": "Search query with relevant keywords"
                      }
                  },
                  "required": ["query"]

@@ -227,246 +177,282 @@ available_tools = [
      }
  ]

+ # --- Streaming Response Generator ---
+ async def generate_streaming_response(messages: List[Dict], use_search: bool, temperature: float):
+     """Generate streaming response with optional search"""

+     try:
+         # Initial LLM call with streaming
+         llm_kwargs = {
+             "model": "unsloth/Qwen3-30B-A3B-GGUF",
+             "temperature": temperature,
+             "messages": messages,
+             "max_tokens": 2000,
+             "stream": True
+         }
+
+         if use_search:
+             llm_kwargs["tools"] = available_tools
+             llm_kwargs["tool_choice"] = "auto"
+
+         source_links = []
+         response_content = ""
+         tool_calls_data = []
+
+         # First streaming call
+         stream = client.chat.completions.create(**llm_kwargs)
+
+         for chunk in stream:
+             delta = chunk.choices[0].delta
+
+             # Handle content streaming
+             if delta.content:
+                 content_chunk = delta.content
+                 response_content += content_chunk
+                 yield f"data: {json.dumps({'type': 'content', 'data': content_chunk})}\n\n"
+
+             # Handle tool calls
+             if delta.tool_calls:
+                 for tool_call in delta.tool_calls:
+                     if len(tool_calls_data) <= tool_call.index:
+                         tool_calls_data.extend([{"id": "", "function": {"name": "", "arguments": ""}}
+                                                 for _ in range(tool_call.index + 1 - len(tool_calls_data))])
+
+                     if tool_call.id:
+                         tool_calls_data[tool_call.index]["id"] = tool_call.id
+                     if tool_call.function.name:
+                         tool_calls_data[tool_call.index]["function"]["name"] = tool_call.function.name
+                     if tool_call.function.arguments:
+                         tool_calls_data[tool_call.index]["function"]["arguments"] += tool_call.function.arguments
+
+         # Process tool calls if any
+         if tool_calls_data and any(tc["function"]["name"] for tc in tool_calls_data):
+             yield f"data: {json.dumps({'type': 'status', 'data': 'Searching...'})}\n\n"
+
+             # Execute searches concurrently for speed
+             search_tasks = []
+             for tool_call in tool_calls_data:
+                 if tool_call["function"]["name"] == "google_search":
+                     try:
+                         args = json.loads(tool_call["function"]["arguments"])
+                         query = args.get("query", "").strip()
+                         if query:
+                             search_tasks.append(google_search_tool_async(query))
+                     except json.JSONDecodeError:
+                         continue
+
+             # Run searches concurrently
+             if search_tasks:
+                 search_results_list = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+                 # Combine all search results
+                 all_results = []
+                 for results in search_results_list:
+                     if isinstance(results, list):
+                         all_results.extend(results)
+                         for result in results:
+                             source_links.append({
+                                 "title": result["source_title"],
+                                 "url": result["url"],
+                                 "domain": result["domain"]
+                             })
+
+                 # Format search results
+                 if all_results:
+                     search_context = format_search_results_compact(all_results)
+
+                     # Create new message with search context
+                     search_messages = messages + [{
+                         "role": "system",
+                         "content": f"{search_context}\n\nPlease provide a comprehensive response based on the search results above."
+                     }]
+
+                     yield f"data: {json.dumps({'type': 'status', 'data': 'Generating response...'})}\n\n"
+
+                     # Generate final response with search context
+                     final_stream = client.chat.completions.create(
+                         model="unsloth/Qwen3-30B-A3B-GGUF",
+                         temperature=temperature,
+                         messages=search_messages,
+                         max_tokens=2000,
+                         stream=True
+                     )
+
+                     for chunk in final_stream:
+                         if chunk.choices[0].delta.content:
+                             content = chunk.choices[0].delta.content
+                             yield f"data: {json.dumps({'type': 'content', 'data': content})}\n\n"
+
+         # Send sources and completion
+         if source_links:
+             yield f"data: {json.dumps({'type': 'sources', 'data': source_links})}\n\n"
+
+         yield f"data: {json.dumps({'type': 'done', 'data': {'search_used': bool(source_links)}})}\n\n"

+     except Exception as e:
+         logger.error(f"Streaming error: {e}")
+         yield f"data: {json.dumps({'type': 'error', 'data': str(e)})}\n\n"

+ # --- Streaming Chat Endpoint ---
+ @app.post("/chat/stream")
+ async def chat_stream_endpoint(request: Request, _: None = Depends(verify_origin)):
      if not client:
          raise HTTPException(status_code=500, detail="LLM client not configured")

      try:
          data = await request.json()
          user_message = data.get("message", "").strip()
+         use_search = data.get("use_search", False)  # Default: False
+         temperature = max(0, min(2, data.get("temperature", 0.7)))  # Clamp to valid range
+         conversation_history = data.get("history", [])

+         if not user_message:
+             raise HTTPException(status_code=400, detail="No message provided")
+
+         # Prepare messages
+         current_date = datetime.now().strftime("%Y-%m-%d")
+         system_content = (SYSTEM_PROMPT_WITH_SEARCH if use_search else SYSTEM_PROMPT_NO_SEARCH).format(current_date=current_date)
+         messages = [{"role": "system", "content": system_content}] + conversation_history + [{"role": "user", "content": user_message}]

+         logger.info(f"Stream request - search: {use_search}, temp: {temperature}")

+         return StreamingResponse(
+             generate_streaming_response(messages, use_search, temperature),
+             media_type="text/plain",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+                 "X-Accel-Buffering": "no"  # Disable nginx buffering
+             }
+         )
+
+     except json.JSONDecodeError:
+         raise HTTPException(status_code=400, detail="Invalid JSON")
+     except Exception as e:
+         logger.error(f"Stream endpoint error: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
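For reference, a minimal sketch of a client consuming this endpoint, assuming a local deployment on port 8000 and that `verify_origin` accepts the request; the `type` values (`content`, `status`, `sources`, `done`, `error`) come from `generate_streaming_response` above:

```python
# Sketch of a client reading /chat/stream. Each body line is "data: <json>\n\n".
import json
import requests

with requests.post(
    "http://localhost:8000/chat/stream",  # assumed local deployment
    json={"message": "What is FastAPI?", "use_search": False, "temperature": 0.7},
    stream=True,
) as resp:
    for raw_line in resp.iter_lines(decode_unicode=True):
        if not raw_line or not raw_line.startswith("data: "):
            continue
        event = json.loads(raw_line[len("data: "):])
        if event["type"] == "content":
            print(event["data"], end="", flush=True)
        elif event["type"] == "sources":
            print("\nSources:", [s["url"] for s in event["data"]])
        elif event["type"] in ("done", "error"):
            break
```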

+ # --- Regular Chat Endpoint (for backward compatibility) ---
+ @app.post("/chat")
+ async def chat_endpoint(request: Request, _: None = Depends(verify_origin)):
+     if not client:
+         raise HTTPException(status_code=500, detail="LLM client not configured")

+     try:
+         data = await request.json()
+         user_message = data.get("message", "").strip()
+         use_search = data.get("use_search", False)  # Default: False
+         temperature = max(0, min(2, data.get("temperature", 0.7)))
+         conversation_history = data.get("history", [])

          if not user_message:
              raise HTTPException(status_code=400, detail="No message provided")

+         # Prepare messages
          current_date = datetime.now().strftime("%Y-%m-%d")
+         system_content = (SYSTEM_PROMPT_WITH_SEARCH if use_search else SYSTEM_PROMPT_NO_SEARCH).format(current_date=current_date)
+         messages = [{"role": "system", "content": system_content}] + conversation_history + [{"role": "user", "content": user_message}]

+         source_links = []

          if use_search:
+             # Search-enabled flow (non-streaming for compatibility)
+             llm_response = client.chat.completions.create(
+                 model="unsloth/Qwen3-30B-A3B-GGUF",
+                 temperature=temperature,
+                 messages=messages,
+                 tools=available_tools,
+                 tool_choice="auto",
+                 max_tokens=2000
+             )

+             tool_calls = llm_response.choices[0].message.tool_calls
+
+             if tool_calls:
+                 # Execute searches
+                 search_tasks = []
+                 for tool_call in tool_calls:
+                     if tool_call.function.name == "google_search":
+                         try:
+                             args = json.loads(tool_call.function.arguments)
+                             query = args.get("query", "").strip()
+                             if query:
+                                 search_tasks.append(google_search_tool_async(query))
+                         except json.JSONDecodeError:
+                             continue
+
+                 if search_tasks:
+                     search_results_list = await asyncio.gather(*search_tasks, return_exceptions=True)
+                     all_results = []
+                     for results in search_results_list:
+                         if isinstance(results, list):
+                             all_results.extend(results)
+                             for result in results:
                                  source_links.append({
                                      "title": result["source_title"],
                                      "url": result["url"],
                                      "domain": result["domain"]
                                  })
+
+                     if all_results:
+                         search_context = format_search_results_compact(all_results)
+                         search_messages = messages + [{
+                             "role": "system",
+                             "content": f"{search_context}\n\nPlease provide a comprehensive response based on the search results above."
+                         }]
+
+                         final_response = client.chat.completions.create(
+                             model="unsloth/Qwen3-30B-A3B-GGUF",
+                             temperature=temperature,
+                             messages=search_messages,
+                             max_tokens=2000
+                         )
+                         final_content = final_response.choices[0].message.content
+                     else:
+                         final_content = llm_response.choices[0].message.content
+                 else:
+                     final_content = llm_response.choices[0].message.content
+             else:
+                 final_content = llm_response.choices[0].message.content
+         else:
+             # No search - direct response
+             llm_response = client.chat.completions.create(
+                 model="unsloth/Qwen3-30B-A3B-GGUF",
+                 temperature=temperature,
+                 messages=messages,
+                 max_tokens=2000
+             )
+             final_content = llm_response.choices[0].message.content
+
+         return {
+             "response": final_content,
+             "sources": source_links,
+             "search_used": bool(source_links),
+             "temperature": temperature,
+             "timestamp": datetime.now().isoformat()
+         }

      except Exception as e:
+         logger.error(f"Chat endpoint error: {e}")
+         raise HTTPException(status_code=500, detail=str(e))

+ # --- Health Check Endpoints ---
  @app.get("/")
  async def root():
      return {
+         "message": "Streaming AI Chatbot API",
+         "version": "2.1.0",
+         "endpoints": ["/chat", "/chat/stream"],
          "timestamp": datetime.now().isoformat()
      }

  @app.get("/health")
  async def health_check():
+     return {
          "status": "healthy",
          "timestamp": datetime.now().isoformat(),
          "services": {
              "llm_client": client is not None,
              "google_search": bool(GOOGLE_API_KEY and GOOGLE_CX)
          }
+     }
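And a matching sketch for the backward-compatible `/chat` endpoint, under the same local-deployment assumption; the response keys are taken from the return dict above:

```python
# Sketch of a non-streaming /chat call, again assuming a local deployment
# and that verify_origin accepts the request.
import requests

resp = requests.post(
    "http://localhost:8000/chat",  # assumed local deployment
    json={"message": "Summarize today's AI news", "use_search": True},
    timeout=120,
)
resp.raise_for_status()
body = resp.json()
print(body["response"])                     # model answer
print([s["url"] for s in body["sources"]])  # search sources, if any
```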