Fabelous-Ai-Chat/scripts/Rag.py

97 lines
4.0 KiB
Python
Raw Normal View History

from langchain_community.document_loaders import UnstructuredHTMLLoader, WebBaseLoader, UnstructuredMarkdownLoader, PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA
2024-05-20 10:27:03 +00:00
import requests
class Rag:
def __init__(self, embeddings, model,
base_url_llm,base_url_embed, base_header,
embeddings_header):
self.retriever = None
self.data = None
self.model = model
self.base_url_llm = base_url_llm
self.base_url_embed = base_url_embed
2024-05-20 11:24:41 +00:00
# check if header exists
if self.is_empty(base_header):
self.base_header = ""
else:
self.base_header = base_header
if self.is_empty(embeddings_header):
self.embeddings_header = ""
else:
self.embeddings_header = embeddings_header
self.embeddings = OllamaEmbeddings(model=embeddings, headers=self.embeddings_header, base_url=self.base_url_embed)
def init_ollama(self):
self.chat_ollama = ChatOllama(
base_url=self.base_url_llm,
model=self.model,
headers = self.base_header
)
2024-05-20 11:24:41 +00:00
def get_file(self, file_path: str):
# Check if the file path starts with 'https://'
if file_path.startswith('https://'):
2024-05-20 10:27:03 +00:00
try:
loader = WebBaseLoader(file_path)
data = loader.load()
if data is None:
return False
except requests.exceptions.SSLError:
return False
else:
file_type = file_path.split(".")[-1]
try:
match file_type:
case "csv":
loader = CSVLoader(file_path=file_path)
data = loader.load()
case "html": # Corrected the typo in the variable name
loader = UnstructuredHTMLLoader(file_path=file_path)
data = loader.load()
case "md":
loader = UnstructuredMarkdownLoader(file_path=file_path)
data = loader.load()
case "pdf":
loader = PyPDFLoader(file_path=file_path)
data = loader.load_and_split()
case _:
2024-05-20 10:27:03 +00:00
return False
except OSError:
return False
self.data = data
return True
2024-05-20 11:24:41 +00:00
def is_empty(self, dictionary: dict) -> bool:
return len(dictionary) == 1 and list(dictionary.keys())[0] == '' and list(dictionary.values())[0] == ''
2024-05-20 11:24:41 +00:00
def receive_data(self, file_path: str):
try:
if self.get_file(file_path):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
splitted = text_splitter.split_documents(self.data)
self.retriever = Chroma.from_documents(documents=splitted, embedding=self.embeddings).as_retriever()
return (False, "Success")
else:
return (True, f"'{file_path}' unsupported, read documentation for more information")
2024-05-19 13:33:03 +00:00
except (ValueError, AttributeError) as e:
return (True, f"An unexpected Error occuried: {e}")
2024-05-20 11:24:41 +00:00
def get_request(self, prompt: str) -> str:
qachain=RetrievalQA.from_chain_type(self.chat_ollama, retriever=self.retriever)
try:
return qachain.invoke({"query": prompt})["result"]
2024-05-19 13:33:03 +00:00
except ValueError as e:
2024-05-20 11:24:41 +00:00
return f"An unexpected Error occuried: {e}"