Chroma DB has this functionality (persisting the vector store to disk and reloading it later): Chroma | 🦜️🔗 Langchain
But I can't find equivalent functionality in Weaviate's documentation despite a lot of searching.
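For reference, this is roughly the Chroma pattern I mean (a sketch using the same splitted_docs_list and embeddings as in my code below; the './chroma_db' path is just an example):

from langchain.vectorstores import Chroma

# first run: embed the chunks and write them to disk
chroma_store = Chroma.from_documents(
    splitted_docs_list,
    embeddings,
    persist_directory='./chroma_db'  # example path
)
chroma_store.persist()

# later runs: reload the persisted store without re-embedding anything
chroma_store = Chroma(
    persist_directory='./chroma_db',
    embedding_function=embeddings
)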
Here’s my full code for a local document querying app I’m building:
import os, weaviate
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores.weaviate import Weaviate
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# get environment variables
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # .get() takes parentheses, not square brackets
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
# load documents
doc_loader = DirectoryLoader(
    './Docs',  # the relative directory address
    glob='**/*.pdf',  # load all PDF files in every subdirectory
    show_progress=True
)
docs = doc_loader.load()
# split documents
splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300
)
splitted_docs_list = splitter.split_documents(docs)
auth_config = weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY)
client = weaviate.Client(
    url="https://weaviate-sandbox-cluster-xxxxxxx.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers={
        "X-OpenAI-Api-Key": OPENAI_API_KEY
    }
)
# set index_name and vectorizer for the database
class_obj = {
    "class": "LangChain",
    "vectorizer": "text2vec-openai",
}
try:
    # add the class to the schema
    client.schema.create_class(class_obj)
except weaviate.exceptions.UnexpectedStatusCodeException:
    # the class already exists in the cluster
    print("Class already exists")
embeddings = OpenAIEmbeddings()
# I use 'LangChain' for index_name and 'text' for text_key
vectorstore = Weaviate(client, "LangChain", "text", embedding=embeddings)
# Add text chunks' embeddings to the Weaviate vector database
texts = [d.page_content for d in splitted_docs_list]
metadatas = [d.metadata for d in splitted_docs_list]
# vectorstore.add_texts(texts, metadatas=metadatas, embedding=embeddings)
vectorstore = Weaviate.from_texts(
    texts,
    embeddings,
    metadatas=metadatas,
    client=client,
    index_name="LangChain",  # without this, from_texts creates a new randomly named class on every run
    text_key="text",
)
# Query the vectorstore with the LLM
llm = ChatOpenAI()
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)
query = "What are these documents about?"  # example question
print(retrieval_qa.run(query))
Every time I run a query, all the docs are read, split into chunks, and the vector store is rebuilt from scratch. So, is it possible to save the vector store on disk instead and reuse it for further querying?
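Ideally, on subsequent runs I'd like to skip the load/split/embed steps entirely and just reconnect to the already-populated store, something like this sketch (assuming the embeddings actually persist in the Weaviate cluster between runs):

# reconnect to the existing "LangChain" class instead of re-ingesting
client = weaviate.Client(
    url="https://weaviate-sandbox-cluster-xxxxxxx.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}
)
vectorstore = Weaviate(client, "LangChain", "text", embedding=embeddings)
retrieval_qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)
print(retrieval_qa.run(query))

Is that the right approach with Weaviate, or is there a dedicated save/load API I'm missing?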