Hi,
I am trying to use WeaviateHybridSearchRetriever from LangChain, and I get an error when I run a query (the failing call is the last line of the code below).
I am using custom vectors. This is my code:
# For using WCS
import weaviate
import json
import os
from langchain.vectorstores import Weaviate
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
from langchain.schema import Document
import numpy as np
import json
# Connection settings are read from the environment so that neither the
# endpoint nor the API key is hard-coded in the script.
WEAVIATEURL = os.environ["WEAVIATE_URL"]  # your WCS endpoint
API_KEY = os.environ["WEAVIATE_API_KEY"]  # your Weaviate instance API key

client = weaviate.Client(
    url=WEAVIATEURL,
    auth_client_secret=weaviate.AuthApiKey(api_key=API_KEY),
)

# Print the server metadata to confirm that the connection works.
meta_info = client.get_meta()
print(json.dumps(meta_info, indent=2))
client.schema.get()
###############################################################################################
# Load every PDF found in the directory and split it into token-based chunks.
pdf_path = dir_of_pdfs
loader = PyPDFDirectoryLoader(pdf_path)
documents = loader.load()
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=25)
docs = text_splitter.split_documents(documents)
print(f"Now you have {len(docs)} documents")

# Embedding model used to compute the custom vectors on the CPU.
embed_model_id = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"device": "cpu", "batch_size": 32},
)

# Keep chunk text and chunk source in two parallel lists, then embed the text.
content_docs = [doc.page_content for doc in docs]
source_docs = [doc.metadata["source"] for doc in docs]
embeddings = embed_model.embed_documents(content_docs)
print(len(embeddings))
print(len(embeddings[0]))
############################################################################################################
# Writing my vectors to JSON.
# Format the vectors in the desired JSON structure: one record per chunk,
# holding its text, its source and its embedding.
combined_data = [
    {"Content": content, "Source": source, "Vector": vector}
    for content, source, vector in zip(content_docs, source_docs, embeddings)
]

# Dump the formatted vectors to a JSON file.
with open("data.json", "w") as json_file:
    json.dump(combined_data, json_file)
#############################################################################################################
# Creating the class.
# Drop any existing "Ragdocs" class first so it is recreated from scratch.
if client.schema.exists("Ragdocs"):
    client.schema.delete_class("Ragdocs")

# Schema for the class.
class_obj = {
    "class": "Ragdocs",
    # If set to "none" you must always provide vectors yourself.
    # Could be any other "text2vec-*" also.
    "vectorizer": "none",
}
client.schema.create_class(class_obj)  # returns null on success
####################################################################################################################
# Importing data.
# Read and load the JSON data from the file.
with open("data.json", "r") as file:
    data = json.load(file)

# Configure the batch process.
client.batch.configure(batch_size=1)

# Import every record together with its precomputed vector; the batch is
# used as a context manager and objects are added through its handle.
with client.batch as batch:
    for i, d in enumerate(data, start=1):
        print(f"importing document no: {i}")
        properties = {
            "text": d["Content"],
            "source": d["Source"],
        }
        custom_vector = d["Vector"]
        batch.add_data_object(
            properties,
            "Ragdocs",
            vector=custom_vector,
        )
##########################################################################################################################
# Hybrid-search retriever over the "Ragdocs" class, reading the chunk text
# from the "text" property.
retriever = WeaviateHybridSearchRetriever(
    client=client,
    index_name="Ragdocs",  # index name is the class name
    text_key="text",
    attributes=[],  # NOTE(review): presumably the original value was an empty list - confirm
    create_schema_if_missing=True,
)
The code ran without any issues up to this point.
################################################################################################################################
# The "Ragdocs" class is created with vectorizer "none", so the query vector
# has to be supplied by the caller. Embed the query with the same model that
# produced the document vectors and hand it to the hybrid search.
# NOTE(review): needs a LangChain version whose WeaviateHybridSearchRetriever
# accepts `hybrid_search_kwargs` - confirm against the installed version.
query = "what is OpenVINO?"
retriever.get_relevant_documents(
    query,
    hybrid_search_kwargs={"vector": embed_model.embed_query(query)},
)
From the error, it looks like I need to provide the query vector separately, since I am not using Weaviate's vectorizer module. How do I provide a vector for the query through the retriever?