Spaces:

GIZ
/

audit_assistant

Running on T4

App Files Community

ppsingh committed on Jul 10, 2024

Commit

723ac7e

verified ·

1 Parent(s): d4a2dd9

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -8

app.py CHANGED Viewed

@@ -8,9 +8,16 @@ import re
 import json
 from auditqa.sample_questions import QUESTIONS
 from auditqa.reports import POSSIBLE_REPORTS
-from auditqa.engine.prompts import audience_prompts
 from auditqa.doc_process import process_pdf
-process_pdf()
 async def chat(query,history,audience,sources,reports):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
@@ -21,6 +28,9 @@ async def chat(query,history,audience,sources,reports):
     print(f"audience:{audience}")
     print(f"sources:{sources}")
     print(f"reports:{reports}")
     if audience == "Children":
         audience_prompt = audience_prompts["children"]
@@ -33,20 +43,101 @@ async def chat(query,history,audience,sources,reports):
     # Prepare default values
     if len(sources) == 0:
-        sources = ["IPCC"]
     if len(reports) == 0:
         reports = []
-    history = [tuple(x) for x in history]
-    docs_html = ""
-    output_query = ""
-    output_language = "ENG"
     yield history,docs_html,output_query,output_language
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------

 import json
 from auditqa.sample_questions import QUESTIONS
 from auditqa.reports import POSSIBLE_REPORTS
+from auditqa.engine.prompts import audience_prompts, answer_prompt_template
 from auditqa.doc_process import process_pdf
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain.llms import HuggingFaceEndpoint
+from dotenv import load_dotenv
+load_dotenv()
+HF_token = os.environ["HF_TOKEN"]
+vectorstores = process_pdf()
 async def chat(query,history,audience,sources,reports):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     print(f"audience:{audience}")
     print(f"sources:{sources}")
     print(f"reports:{reports}")
+    docs_html = ""
+    output_query = ""
+    output_language = "english"
     if audience == "Children":
         audience_prompt = audience_prompts["children"]
     # Prepare default values
     if len(sources) == 0:
+        sources = ["ABC"]
     if len(reports) == 0:
         reports = []
+    if sources == ["ABC"]:
+        vectorstore = vectorstores["ABC"]
+    else:
+        vectorstore = vectorstores["XYZ"]
+# get context
+    context_retrieved_lst = []
+    question_lst= [query]
+    for question in question_lst:
+        retriever = vectorstore.as_retriever(
+          search_type="similarity",
+          search_kwargs={"k": 1})
+        context_retrieved = retriever.get_relevant_documents(question)
+        def format_docs(docs):
+            return "\n\n".join(doc.page_content for doc in docs)
+        context_retrieved_formatted = format_docs(context_retrieved)
+        context_retrieved_lst.append(context_retrieved_formatted)
+# get prompt
+    prompt = ChatPromptTemplate.from_template(answer_prompt_template)
+# get llm
+    llm_qa = HuggingFaceEndpoint(
+      endpoint_url= "https://fesg9gjsfde5yfr4.us-east-1.aws.endpoints.huggingface.cloud",
+      task="text-generation",
+      huggingfacehub_api_token=HF_token,
+      model_kwargs={})
+# create rag chain
+    chain = prompt | llm_qa | StrOutputParser()
+# get answers
+    answer_lst = []
+    for question, context in zip(question_list , context_retrieved_lst):
+        answer = chain.invoke({"context": context, "question": question,'audience':audience_prompt, 'language':'english'})
+        answer_lst.append(answer)
+    docs_html = []
+    for i, d in enumerate(context_retrieved, 1):
+        docs_html.append(make_html_source(d, i))
+    docs_html = "".join(docs_html)
+    previous_answer = history[-1][1]
+    previous_answer = previous_answer if previous_answer is not None else ""
+    answer_yet = previous_answer + answer_lst[0]
+    answer_yet = parse_output_llm_with_sources(answer_yet)
+    history[-1] = (query,answer_yet)
+    history = [tuple(x) for x in history]
     yield history,docs_html,output_query,output_language
def make_html_source(source, i):
    """Render one retrieved document chunk as an HTML source card.

    Args:
        source: Retrieved document exposing ``page_content`` (str) and
            ``metadata`` (dict). The metadata must provide ``chunk_type``
            and, for text chunks, ``name``, ``short_name``, ``page_number``
            and ``url``. ``toc_level0`` / ``toc_level1`` are optional.
        i: Number of the document in the result list; used for the card's
            element id and its heading.

    Returns:
        The HTML markup of the card, or an empty string when the chunk is
        not of type ``"text"`` (no card layout is defined for other types).

    Raises:
        KeyError: If a required metadata key is missing.
    """
    # Function-scope import keeps the block self-contained.
    from html import escape

    meta = source.metadata

    # Only text chunks have a card layout. Returning "" instead of falling
    # through avoids an UnboundLocalError on `card` and is harmless for
    # callers that concatenate the cards.
    if meta["chunk_type"] != "text":
        return ""

    # Page text and metadata are escaped so that characters such as "<" or
    # "&" cannot break the surrounding markup.
    content = escape(source.page_content.strip())
    doc_name = escape(str(meta["name"]))
    short_name = escape(str(meta["short_name"]))
    url = escape(str(meta["url"]), quote=True)
    page_number = int(meta["page_number"])

    # Collect table-of-contents levels until the first missing one; an
    # absent key is treated the same as the "N/A" placeholder.
    toc_levels = []
    for j in range(2):
        level = meta.get(f"toc_level{j}", "N/A")
        if level == "N/A":
            break
        toc_levels.append(escape(str(level)))
    toc_levels = " > ".join(toc_levels)

    if toc_levels:
        name = f"<b>{toc_levels}</b><br/>{doc_name}"
    else:
        name = doc_name

    card = f"""
    <div class="card" id="doc{i}">
        <div class="card-content">
            <h2>Doc {i} - {short_name} - Page {page_number}</h2>
            <p>{content}</p>
        </div>
        <div class="card-footer">
            <span>{name}</span>
            <a href="{url}#page={page_number}" target="_blank" class="pdf-link">
                <span role="img" aria-label="Open PDF">🔗</span>
            </a>
        </div>
    </div>
    """
    return card
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------