import json
import os
import re

import google.generativeai as genai
from dotenv import load_dotenv

from llm_utils import generate_with_retry
| |
|
# Load GEMINI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

LOG_FILE = "rag_eval_logs.jsonl"
MODEL_NAME = "gemini-2.5-flash"
API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast: the judge model is unusable without an API key.
# `raise SystemExit(1)` instead of `exit(1)`: `exit` is a site-module
# convenience for interactive sessions and is not guaranteed to exist.
if not API_KEY:
    print("❌ GEMINI_API_KEY not found in env.")
    raise SystemExit(1)

genai.configure(api_key=API_KEY)
| |
|
def calculate_faithfulness(answer, contexts):
    """Score how faithful `answer` is to the retrieved `contexts`.

    Asks the Gemini judge model for a 0.0-1.0 rating, where 1.0 means the
    answer is strictly derived from the context and 0.0 means it contains
    information not present in the context.

    Args:
        answer: The generated answer string to evaluate.
        contexts: List of retrieved context strings the answer was based on.

    Returns:
        Float clamped to [0.0, 1.0]; 0.0 when `contexts` is empty/falsy,
        0.5 (neutral fallback) when the judge call or score parsing fails.
    """
    if not contexts:
        return 0.0

    context_text = "\n".join(contexts)
    # Context is truncated to keep the judge prompt within a bounded size.
    prompt = f"""
You are an AI Judge.
Rate the 'Faithfulness' of the Answer to the Context on a scale of 0.0 to 1.0.
1.0 = Answer is strictly derived from Context.
0.0 = Answer contains hallucinations or info not in Context.

Context: {context_text[:3000]}

Answer: {answer}

Return ONLY a single float number (e.g. 0.9).
"""
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        # Judges sometimes wrap the score in extra text ("Score: 0.9"),
        # so extract the first numeric token rather than float()-ing raw text.
        match = re.search(r"-?\d+(?:\.\d+)?", resp.text)
        if match is None:
            raise ValueError(f"no numeric score in judge reply: {resp.text!r}")
        score = float(match.group())
        return max(0.0, min(1.0, score))
    except Exception as e:
        # Narrowed from a bare `except:` (which also swallowed SystemExit /
        # KeyboardInterrupt) and made visible instead of failing silently.
        print(f"⚠️ Faithfulness judge failed: {e}")
        return 0.5
| |
|
def calculate_relevancy(query, answer):
    """Score how directly `answer` addresses `query`.

    Asks the Gemini judge model for a 0.0-1.0 rating, where 1.0 means the
    answer directly addresses the query and 0.0 means it is unrelated.

    Args:
        query: The user's original question string.
        answer: The generated answer string to evaluate.

    Returns:
        Float clamped to [0.0, 1.0]; 0.5 (neutral fallback) when the judge
        call or score parsing fails.
    """
    prompt = f"""
You are an AI Judge.
Rate the 'Relevancy' of the Answer to the Query on a scale of 0.0 to 1.0.
1.0 = Answer directly addresses the query.
0.0 = Answer is unrelated or ignores the user.

Query: {query}
Answer: {answer}

Return ONLY a single float number (e.g. 0.9).
"""
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        # Judges sometimes wrap the score in extra text ("Score: 0.9"),
        # so extract the first numeric token rather than float()-ing raw text.
        match = re.search(r"-?\d+(?:\.\d+)?", resp.text)
        if match is None:
            raise ValueError(f"no numeric score in judge reply: {resp.text!r}")
        score = float(match.group())
        return max(0.0, min(1.0, score))
    except Exception as e:
        # Narrowed from a bare `except:` (which also swallowed SystemExit /
        # KeyboardInterrupt) and made visible instead of failing silently.
        print(f"⚠️ Relevancy judge failed: {e}")
        return 0.5
| |
|
def run_audit():
    """Run a post-hoc quality audit over logged RAG interactions.

    Reads JSONL records from LOG_FILE, scores each complete record's
    faithfulness and relevancy via the Gemini judge functions, prints a
    per-query table, and finishes with average scores. Records without a
    final answer are skipped; malformed or failing lines are reported
    with their line number instead of being silently dropped.
    """
    if not os.path.exists(LOG_FILE):
        print(f"No log file found at {LOG_FILE}")
        return

    print(f"📊 Running Post-Hoc Audit on {LOG_FILE}...\n")
    print(f"{'Query':<30} | {'Faithful':<10} | {'Relevancy':<10}")
    print("-" * 60)

    total_f = 0
    total_r = 0
    count = 0

    with open(LOG_FILE, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, 1):
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Previously `except Exception: pass` hid corrupt log lines.
                print(f"⚠️ Skipping malformed JSON on line {line_no}")
                continue

            # Only audit records that actually produced an answer.
            if not data.get("final_answer"):
                continue

            try:
                q = data["query"]
                a = data["final_answer"]
                c = data.get("context_list", [])

                f_score = calculate_faithfulness(a, c)
                r_score = calculate_relevancy(q, a)
            except Exception as e:
                # Keep the audit running past one bad record, but say why.
                print(f"⚠️ Skipping record on line {line_no}: {e}")
                continue

            print(f"{q[:30]:<30} | {f_score:.2f} | {r_score:.2f}")

            total_f += f_score
            total_r += r_score
            count += 1

    if count > 0:
        print("-" * 60)
        print("\n✅ Audit Complete.")
        print(f"Average Faithfulness: {total_f/count:.2f}")
        print(f"Average Relevancy: {total_r/count:.2f}")
    else:
        print("\n⚠️ No complete records found to audit. Ask some questions first!")
| |
|
# Script entry point: audit the existing RAG logs when run directly.
if __name__ == "__main__":
    run_audit()
| |
|