Weaviate Hybrid Retriever issue in Langchain for custom vectors

Hi,

I am trying to use WeaviateHybridSearchRetriever from LangChain, and I am getting this error when I query:

I am using custom vectors. This is my code

# For using WCS

import weaviate
import json
import os
from langchain.vectorstores import Weaviate
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
from langchain.schema import Document
import numpy as np
import json

# Weaviate Cloud Services (WCS) endpoint. The value was left blank in the
# original post, so a placeholder is used here; substitute your own cluster URL.
WEAVIATEURL = "https://YOUR-CLUSTER.weaviate.network"

# NOTE(review): API_KEY is not defined anywhere in this snippet; it must be
# set to your Weaviate instance API key before this runs.
client = weaviate.Client(
    url=WEAVIATEURL,  # Replace with your endpoint
    auth_client_secret=weaviate.AuthApiKey(api_key=API_KEY),  # Replace w/ your Weaviate instance API key
)

# Print the server meta information and fetch the current schema as a quick
# check that the connection and credentials work.
meta_info = client.get_meta()
print(json.dumps(meta_info, indent=2))
client.schema.get()

###############################################################################################

# NOTE(review): dir_of_pdfs is not defined in this snippet; it must hold the
# path of the directory containing the PDF files.
pdf_path = dir_of_pdfs
loader = PyPDFDirectoryLoader(pdf_path)
documents = loader.load()

# Split the loaded documents into token-based chunks
# (chunk_size=500, chunk_overlap=25).
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=25)
docs = text_splitter.split_documents(documents)
print(f'Now you have {len(docs)} documents')

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Local sentence-transformers model, run on CPU, used to build the custom
# vectors that are later imported into Weaviate.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': "cpu"},
    encode_kwargs={'device': "cpu", 'batch_size': 32},
)

# Pull the chunk text and its source path out of every chunk, keeping the
# two lists in the same order as `docs`.
doc_func_source = lambda x: x.metadata['source']
doc_func_content = lambda x: x.page_content
content_docs = list(map(doc_func_content, docs))
source_docs = list(map(doc_func_source, docs))

# One embedding vector per chunk; print the count and the vector length.
embeddings = embed_model.embed_documents(content_docs)
print(len(embeddings))
print(len(embeddings[0]))

############################################################################################################

# writing my vectors in JSON

# Format the vectors in the desired JSON structure: one record per chunk,
# pairing its text, its source and its embedding.
combined_data = [
    {"Content": content, "Source": source, "Vector": vector}
    for content, source, vector in zip(content_docs, source_docs, embeddings)
]

# Dump the formatted vectors to a JSON file
with open('data.json', 'w') as json_file:
    json.dump(combined_data, json_file)

#############################################################################################################

# creating class

# Drop any existing "Ragdocs" class first so the script can be re-run.
if client.schema.exists("Ragdocs"):
    client.schema.delete_class("Ragdocs")

# schema for class
class_obj = {
    'class': 'Ragdocs',
    "vectorizer": "none",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
}

client.schema.create_class(class_obj)  # returns null on success

####################################################################################################################

# importing data

# Read and load the JSON data from the file
with open('data.json', 'r') as file:
    data = json.load(file)

# Configure batch process
# NOTE(review): batch_size=1 presumably flushes after every single object;
# a larger value is likely faster for bigger imports - confirm before changing.
client.batch.configure(batch_size=1)

# Run the import inside the batch context manager and add objects through the
# handle it yields.
with client.batch as batch:
    # Batch import all documents
    for i, d in enumerate(data):
        print(f"importing document no: {i+1}")

        properties = {
            "text": d["Content"],
            "source": d["Source"],
        }

        # The class uses vectorizer "none", so the pre-computed embedding is
        # supplied explicitly with each object.
        custom_vector = d["Vector"]

        batch.add_data_object(
            properties,
            "Ragdocs",
            vector=custom_vector,
        )

##########################################################################################################################

# Hybrid (keyword + vector) retriever over the "Ragdocs" class.
retriever = WeaviateHybridSearchRetriever(
    client=client,
    index_name="Ragdocs",  # index name is class name
    text_key="text",
    # NOTE(review): the value of `attributes` was lost when the post was
    # rendered; an empty list is assumed here - confirm against the original.
    attributes=[],
    create_schema_if_missing=True,
)

The code ran without any issues up to this point.

################################################################################################################################
# This is the query call that produces the error described in the post.
retriever.get_relevant_documents("what is OpenVINO?")

From the error, it looks like I need to provide the vector for the query separately, since I am not using a Weaviate vectorizer module. How do I provide a vector for the query through the retriever?

Hi!

Have you seen this Python notebook?

I believe you need to instantiate the Weaviate client with the OpenAI API key so that it can vectorize your query.

here:

import os

import weaviate

# Connection settings are read from the environment so that no secrets are
# hard-coded in the script.
WEAVIATE_URL = os.getenv("WEAVIATE_URL")

# Build the API-key credentials only when a key is configured; None keeps the
# client's default (unauthenticated) behaviour. A stray trailing comma used to
# turn this value into a one-element tuple, and it was never handed to the
# client, so the key was silently ignored.
_weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
auth_client_secret = (
    weaviate.AuthApiKey(api_key=_weaviate_api_key) if _weaviate_api_key else None
)

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=auth_client_secret,
    additional_headers={
        # Forwarded to Weaviate so an OpenAI-backed module can call the API.
        "X-Openai-Api-Key": os.getenv("OPENAI_API_KEY"),
    },
)

# client.schema.delete_all()

Also, you will probably find this recipe interesting :wink:

Let me know if that helps :slight_smile: