Near_vector() input validator bug?

Description

I am using near_vector() for similarity search and I am getting the following error:
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 61, in _is_valid
return all(isinstance(val, args[0]) for val in value)
TypeError: isinstance() arg 2 must be a type or tuple of types

Server Setup Information

  • Weaviate Server Version: local Docker hosting, image cr.weaviate.io/semitechnologies/weaviate:1.28.3
  • Deployment Method: Docker (set up straight from the Weaviate website instructions for creating a local Docker instance)
  • Multi Node? Number of Running Nodes: 1
  • Client Language and Version: Sorry, I am a software newbie and not sure what these mean. Python 3.11? (I added a small snippet right after this list that prints the exact versions.)
  • Multitenancy?:
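
In case it helps, this is the small snippet I mentioned above for printing the exact versions (Python, client, and server). As far as I can tell, weaviate.__version__ and client.get_meta() are the usual ways to read these, but please correct me if there is a better way:

import sys
import weaviate

client = weaviate.connect_to_local()
print("Python:", sys.version)                                  # interpreter actually running the script
print("weaviate-client:", weaviate.__version__)                # Python client version
print("Weaviate server:", client.get_meta().get("version"))    # version of the Docker server
client.close()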

Any additional Information

I used the following method to embed the text into a list of floats:
embedding_model = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-mpnet-base-v2')
embedded_chunk = embedding_model.embed_documents(chunk)[0]
chunks_with_metadata_list.append(wvc.data.DataObject(
properties = chunk_metadata,
vector = embedded_chunk
))
Where the chunks are made with text_splitter from LangChain.
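
(Side note, in case it is related: as far as I understand, embed_documents() expects a list of strings, while embed_query() takes a single string, so for one chunk at a time the call might look like the sketch below. I have not tested whether it changes anything here.)

# untested alternative: embed a single string with embed_query()
# instead of indexing into the result of embed_documents()
embedded_chunk = embedding_model.embed_query(chunk)
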
This is the full code:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import weaviate
import weaviate.classes as wvc
import weaviate.classes.config as wc
import json
import os

client = weaviate.connect_to_local()
# check if the client is alive
assert client.is_live()

# delete the existing schema + create schema 
client.collections.delete("AutomotiveFinance")
client.collections.create(
    name = 'AutomotiveFinance',
    properties=[
        wc.Property(name = 'page1', data_type = wc.DataType.INT),
        wc.Property(name = 'page2', data_type = wc.DataType.INT),
        wc.Property(name = 'company', data_type = wc.DataType.TEXT),
        wc.Property(name = 'doc_type', data_type = wc.DataType.TEXT),
        wc.Property(name = 'raw_text', data_type = wc.DataType.TEXT),
    ], 
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
)
auto_finance = client.collections.get("AutomotiveFinance")

# get the path of each json file
json_top_file_path = r'C:\Users\ikim1\OneDrive\Desktop\RAG file'
json_file_path = []
for file in os.listdir(json_top_file_path):
    if file.endswith('.json'):
        file_path = os.path.join(json_top_file_path, file)
        json_file_path.append(file_path)

# Initialize the text splitter +  embedding model
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Maximum size of each chunk
    chunk_overlap=100  # Overlap between consecutive chunks
)
embedding_model = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-mpnet-base-v2')
# for each file path of json get the chunks. 

chunks_with_metadata_list = []
for one_json_path in json_file_path: 
    # each json had the following structure
    # json['pages'], json['file_path'], json['company'], json['doc_type']
    # in json['pages'], there is list of each page as element 
    
    # open the json file 
    with open(one_json_path, 'r') as file: 
        json_data = json.load(file)
        pages = json_data['pages']
        company = json_data['company']
        doc_type = json_data['doc_type']
        
        # make the entire string from the pages
        # make sure to insert the page numbers as well. 
        old_page_num = 0; old_md = ''; old_raw_txt = ''
        json_string = '' 
        for i, page in enumerate(pages): 
            md = page['md']
            raw_txt = page['text']
            page_num = page['page']
            print(i)
            # if this is the second one, then start the chunking process
            if i > 0: 
                old_combined_str = "THIS IS PAGE " + str(old_page_num) + '\n' + old_md + '\n' + old_raw_txt
                new_combined_str = "THIS IS PAGE " + str(page_num) + '\n' + md + '\n' + raw_txt
                combined_str = new_combined_str + '\n' + old_combined_str
                # chunk the combined_str using recursive splitting, and inject the metadata
                chunks = text_splitter.split_text(combined_str)
                # inject the metadata into the chunks
                for chunk in chunks: 
                    # embed the chunk: the output is already a list, so no conversion is needed for Weaviate
                    embedded_chunk = embedding_model.embed_documents(chunk)[0]
                    chunk_metadata = {
                        "page1" : old_page_num, "page2" : page_num, 
                        "company" : company, "doc_type" : doc_type, 
                        'raw_text' : chunk
                        }
                    chunks_with_metadata_list.append(wvc.data.DataObject(
                        properties = chunk_metadata,
                        vector = embedded_chunk
                    ))
            # cache the previous one
            old_md = md
            old_raw_txt = raw_txt
            old_page_num = page_num
            
auto_finance.data.insert_many(chunks_with_metadata_list)

query_vector = embedding_model.embed_documents("what is Honda Cash and cash equivalents?")[0]
import time 
time.sleep(2)
response = auto_finance.query.near_vector(
    near_vector = query_vector,
    limit = 4  
)

for o in response.objects:
    print(o.properties['raw_text'])
    print(o.properties['company'])
    print(o.properties['page1'])
    print(o.properties['page2'])
client.close()

Then I followed this section to do near_vector() search:

from weaviate.classes.query import MetadataQuery
import weaviate
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import base64

embedding_model = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-mpnet-base-v2')
client = weaviate.connect_to_local()
auto_finance= client.collections.get("AutomotiveFinance")

query_vector = embedding_model.embed_documents("what is Honda Cash and cash equivalents?")[0]
# I checked that query_vector is a list of floats

import time 
time.sleep(2)
response = auto_finance.query.near_vector(
    near_vector = query_vector,
    limit = 4  
)

for o in response.objects:
    print(o.properties['raw_text'])
    print(o.properties['company'])
    print(o.properties['page1'])
    print(o.properties['page2'])
client.close()

But I am getting the following error:

Traceback (most recent call last):
  File "c:/Users/ikim1/OneDrive/Desktop/RAG file/SimSearch.py", line 16, in <module>
    response = auto_finance.query.near_vector(
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\syncify.py", line 23, in sync_method
    return _EventLoopSingleton.get_instance().run_until_complete(
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\event_loop.py", line 40, in run_until_complete       
    return fut.result()
  File "C:\Users\ikim1\AppData\Local\Programs\Python38\lib\concurrent\futures\_base.py", line 439, in result    
    return self.__get_result()
  File "C:\Users\ikim1\AppData\Local\Programs\Python38\lib\concurrent\futures\_base.py", line 388, in __get_result
    raise self._exception
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\collections\queries\near_vector\query.py", line 92, in near_vector
    res = await self._query.near_vector(
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\collections\grpc\query.py", line 361, in near_vector 
    _validate_input(
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 31, in _validate_input
    if not any(_is_valid(exp, validate.value) for exp in validate.expected):
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 31, in <genexpr>
    if not any(_is_valid(exp, validate.value) for exp in validate.expected):
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 61, in _is_valid
    return all(isinstance(val, args[0]) for val in value)
  File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 61, in <genexpr>
    return all(isinstance(val, args[0]) for val in value)
TypeError: isinstance() arg 2 must be a type or tuple of types

I think I followed every step - if not, please let me know and I can test them out. But from what I have seen, args[0] is outputting ~T because the "expected" is typing.List. I am not really sure why it is coded like this, but please let me know. Thanks!
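
For what it is worth, here is a tiny standalone sketch (no Weaviate involved) of what I think the validator is running into. On Python 3.8, which the Python38 paths in the traceback suggest my RAG-blog venv was built on, the unparameterized typing.List carries the TypeVar ~T as its only argument, and isinstance() against a TypeVar raises exactly this error:

import typing

print(typing.List.__args__)               # (~T,) on Python 3.7/3.8: a TypeVar, not a real type
try:
    isinstance(0.5, typing.List.__args__[0])
except TypeError as e:
    print(e)                              # isinstance() arg 2 must be a type or tuple of types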

As a lazy/quick patch, I just commented out line 61, replaced it with "pass", and it seems to be retrieving the documents fine… I know this is NOT the way it is supposed to be done, but I am mentioning it in case it helps anyone.
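
If anyone wants the same effect without editing site-packages, I think a monkeypatch from the calling script would behave the same way. This is hypothetical and untested; the _is_valid name and its two-argument call are taken from the traceback above:

import weaviate.validator as _validator

_orig_is_valid = _validator._is_valid

def _patched_is_valid(expected, value):
    # only treat the value as valid when the validator itself crashes on the
    # unparameterized typing.List case shown in the traceback
    try:
        return _orig_is_valid(expected, value)
    except TypeError:
        return True

_validator._is_valid = _patched_is_valid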

Hi @Inkyu_Kim!! Welcome to our community :hugs:

I believe I have answered this same issue here:

I was not able to reproduce it.

For reference, this is the minimal reproducible example:

import weaviate
from langchain_huggingface import HuggingFaceEmbeddings
import weaviate.classes.config as wc
from weaviate import classes as wvc

client = weaviate.connect_to_local()
print(f"Client: {weaviate.__version__}, Server: {client.get_meta().get('version')}")

client.collections.delete("Test")
collection = client.collections.create(
    name = 'Test',
    properties=[
        wc.Property(name = 'text', data_type = wc.DataType.TEXT),
    ]
)

embedding_model = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-mpnet-base-v2')
texts = [
    "A dog is a member of the genus Canis that are carnivorous mammals.",
    "A cat is a member of the genus Felis that are carnivorous mammals.",
    "A seagull is a member of the genus Larus that are carnivorous birds, can flap it's wings and go up im the skies",
]
text_vector = embedding_model.embed_documents(texts)

# let's add our objects with their vectors
for i in range(len(texts)):
    collection.data.insert(
        properties={"text": texts[i]},
        vector=text_vector[i]
    )
# check if we have the objects:
print(collection.aggregate.over_all().total_count)

# we need to wait a little bit for it to index
import time
time.sleep(2)
# now we query
query = "will bark at you"
query_vector = embedding_model.embed_documents(query)[0]
#query_vector = [0.123]*768
result = collection.query.hybrid(
    query = query,
    vector = query_vector, alpha = 0.25, limit = 2,
    return_metadata=wvc.query.MetadataQuery(score=True)
)
for o in result.objects:
    print(
        o.metadata.score,
        o.properties["text"]
    )
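
And since the call that failed for you was near_vector rather than hybrid, the equivalent query against the same collection would look roughly like this (untested, continuing from the example above with the same client, collection, and query_vector):

result = collection.query.near_vector(
    near_vector=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)
for o in result.objects:
    print(o.metadata.distance, o.properties["text"])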

Let me know if that helps!

Thanks!