Update app.py
app.py CHANGED
@@ -55,8 +55,10 @@ st.write("---------------------------------")
 
 st.write("LIST OF ALL THE LOADED DOCUMENTS: ")
 st.write("")
-pdf_files = glob.glob("*.pdf")
-for file in pdf_files:
+# pdf_files = glob.glob("*.pdf")
+word_files = glob.glob("*.docx")
+# for file in pdf_files:
+for file in word_files:
     st.write(file)
 
 st.write("---------------------------------")
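Note on the hunk above: glob.glob("*.docx") matches only files in the app's current working directory, and it relies on an import glob earlier in app.py that this diff does not show. Because the PDF lines survive only as comments, the "LIST OF ALL THE LOADED DOCUMENTS" section now lists Word files exclusively. A minimal sketch for listing both formats, assuming the documents sit in the working directory:

    import glob
    import streamlit as st

    # List every PDF and Word document together (the patterns are an
    # assumption; adjust them if the app keeps its files elsewhere).
    doc_files = sorted(glob.glob("*.pdf") + glob.glob("*.docx"))
    for file in doc_files:
        st.write(file)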
@@ -121,6 +123,43 @@ if "vector" not in st.session_state:
 loader = PyPDFDirectoryLoader(path, glob="**/*.pdf")
 docs = loader.load()
 st.session_state.docs = docs
+
+# JB 18-03-2024:
+# https://python.langchain.com/docs/integrations/document_loaders/
+# MICROSOFT WORD:
+# https://python.langchain.com/docs/integrations/document_loaders/microsoft_word
+# 1 - Using Docx2txt
+# Load .docx using Docx2txt into a document.
+# %pip install --upgrade --quiet docx2txt
+# from langchain_community.document_loaders import Docx2txtLoader
+# loader = Docx2txtLoader("example_data/fake.docx")
+# data = loader.load()
+# data
+# [Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]
+#
+# 2A - Using Unstructured
+# from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+# loader = UnstructuredWordDocumentLoader("example_data/fake.docx")
+# data = loader.load()
+# data
+# [Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]
+#
+# 2B - Retain Elements
+# Under the hood, Unstructured creates different “elements” for different chunks of text.
+# By default we combine those together, but you can easily keep that separation by specifying mode="elements".
+# loader = UnstructuredWordDocumentLoader("example_data/fake.docx", mode="elements")
+# data = loader.load()
+# data[0]
+# Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)
+#
+# 2A - Using Unstructured
+from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+loader = UnstructuredWordDocumentLoader(path, glob="**/*.docx")
+docs = loader.load()
+st.session_state.docs = docs
+
+
+
 
 st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 st.session_state.documents = st.session_state.text_splitter.split_documents(st.session_state.docs)
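Note on the hunk above: UnstructuredWordDocumentLoader (new line 157) wraps a single .docx file; unlike PyPDFDirectoryLoader it has no glob parameter, so handing it a directory path plus glob="**/*.docx" will most likely raise an error at load time. New line 159 also overwrites st.session_state.docs, discarding the PDF documents loaded just above it. A sketch of one way to load both sets, assuming langchain_community's DirectoryLoader, that path points at the document folder, and that the unstructured and python-docx packages are installed for .docx parsing:

    from langchain_community.document_loaders import (
        DirectoryLoader,
        PyPDFDirectoryLoader,
        UnstructuredWordDocumentLoader,
    )

    # Load the PDFs exactly as before.
    pdf_docs = PyPDFDirectoryLoader(path, glob="**/*.pdf").load()

    # UnstructuredWordDocumentLoader takes one file path, so let
    # DirectoryLoader walk the folder and invoke it per .docx file.
    word_loader = DirectoryLoader(
        path,
        glob="**/*.docx",
        loader_cls=UnstructuredWordDocumentLoader,
    )
    word_docs = word_loader.load()

    # Extend rather than overwrite, so both formats reach the splitter.
    st.session_state.docs = pdf_docs + word_docs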
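With the loaders combined as sketched above, the RecursiveCharacterTextSplitter at new lines 164-165 chunks both formats before anything reaches the vector store. As committed, only the Word documents survive to the splitter, so the PyPDFDirectoryLoader work is effectively discarded; if that is the intent, removing the PDF loader block outright would be clearer than loading the PDFs and then overwriting them.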