I am trying to use `WeaviateHybridSearchRetriever` from LangChain and am getting this error when I query:
I am using custom vectors. This is my code:
# For using WCS
import weaviate
import json
import os
from langchain.vectorstores import Weaviate
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
from langchain.schema import Document
import numpy as np
import json
# Connect to the Weaviate Cloud Services (WCS) instance and print its
# metadata as a quick connectivity check.
client = weaviate.Client(
    url=WEAVIATEURL,  # Replace with your endpoint
    auth_client_secret=weaviate.AuthApiKey(api_key=API_KEY),  # Replace w/ your Weaviate instance API key
)
meta_info = client.get_meta()
print(json.dumps(meta_info, indent=2))
# Load every PDF in the directory and split the pages into token-based
# chunks (500 tokens per chunk, 25 tokens of overlap).
pdf_path = dir_of_pdfs
loader = PyPDFDirectoryLoader(pdf_path)
documents = loader.load()
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=25)
docs = text_splitter.split_documents(documents)
print(f"Now you have {len(docs)} documents")
# Embed every chunk locally with a sentence-transformers model.
embed_model_id = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbeddings(
    # Pass the model id explicitly; otherwise the class falls back to its
    # own default model and `embed_model_id` is never used.
    model_name=embed_model_id,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"device": "cpu", "batch_size": 32},
)

# Accessors for the two pieces of each chunk that get stored.
doc_func_source = lambda x: x.metadata["source"]
doc_func_content = lambda x: x.page_content
content_docs = list(map(doc_func_content, docs))
source_docs = list(map(doc_func_source, docs))

# One embedding vector per chunk, in the same order as `content_docs`.
embeddings = embed_model.embed_documents(content_docs)
print(len(embeddings))  # number of vectors
print(len(embeddings[0]))  # length of the first vector
# Writing my vectors to JSON
# Format the vectors in the desired JSON structure: one record per chunk,
# pairing its text, its source and its embedding.
combined_data = [
    {"Content": content, "Source": source, "Vector": vector}
    for content, source, vector in zip(content_docs, source_docs, embeddings)
]

# Dump the formatted vectors to a JSON file
with open("data.json", "w") as json_file:
    json.dump(combined_data, json_file)
# Creating the class
# NOTE(review): the body of this `if` is missing from the original paste;
# presumably it dropped the existing class so it can be recreated below.
# Confirm before running against data you want to keep.
if client.schema.exists("Ragdocs"):
    client.schema.delete_class("Ragdocs")

# Schema for the class
class_obj = {
    "class": "Ragdocs",
    # If set to "none" you must always provide vectors yourself.
    # Could be any other "text2vec-*" also.
    "vectorizer": "none",
}
client.schema.create_class(class_obj)  # returns null on success
# Importing data
# Read and load the JSON data from the file
with open("data.json", "r") as file:
    data = json.load(file)

# Configure a batch process
with client.batch as batch:
    # Batch import all documents
    for i, d in enumerate(data):
        print(f"importing document no: {i+1}")
        properties = {
            "text": d["Content"],
            "source": d["Source"],
        }
        custom_vector = d["Vector"]
        # The class has vectorizer "none", so the precomputed vector must be
        # supplied with each object.
        batch.add_data_object(properties, "Ragdocs", vector=custom_vector)
# Build the hybrid-search retriever over the imported class.
retriever = WeaviateHybridSearchRetriever(
    client=client,
    index_name="Ragdocs",  # index name is class name
    text_key="text",  # property that holds the chunk text (set during import)
)
# Code ran without any issue until here.
# This is the query call that raises the error.
retriever.get_relevant_documents("what is OpenVINO?")
From the error, it looks like I need to provide the query vector separately, since I am not using a Weaviate vectorizer module. How do I provide the vector for the query through the retriever?