Description
I am using near_vector() for similarity search and I am getting the following error:
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 61, in <genexpr>
return all(isinstance(val, args[0]) for val in value)
TypeError: isinstance() arg 2 must be a type or tuple of types
Server Setup Information
- Weaviate Server Version: Using Local Docker hosting. image: cr.weaviate.io/semitechnologies/weaviate:1.28.3
- Deployment Method: Docker (straight from the Weaviate website on how to Create Local Docker instance)
- Multi Node? Number of Running Nodes: 1
- Client Language and Version: Sorry… I am a software newbie…Not sure what these mean. Python 3.11?
- Multitenancy?:
Any additional Information
I used the following method to embed each text chunk into a list of floats:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
# embed_documents() expects a list of strings and returns one vector per
# string. A bare string would be iterated character by character, so wrap the
# single chunk in a list before taking the first (only) vector.
embedded_chunk = embedding_model.embed_documents([chunk])[0]
chunks_with_metadata_list.append(wvc.data.DataObject(
    properties=chunk_metadata,
    vector=embedded_chunk,
))
The chunks are produced with LangChain's text splitter (RecursiveCharacterTextSplitter).
This is the full code:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import weaviate
import weaviate.classes as wvc
import weaviate.classes.config as wc
import json
import os
# Connect to the locally hosted Weaviate instance.
client = weaviate.connect_to_local()

# Fail fast if the server is not reachable. An explicit check is used instead
# of `assert` so it still runs under `python -O`, and the connection is closed
# before bailing out instead of being leaked.
if not client.is_live():
    client.close()
    raise RuntimeError("Weaviate is not live; is the local instance running?")

# Start from a clean slate: drop the existing collection, then recreate it.
client.collections.delete("AutomotiveFinance")
client.collections.create(
    name='AutomotiveFinance',
    properties=[
        wc.Property(name='page1', data_type=wc.DataType.INT),
        wc.Property(name='page2', data_type=wc.DataType.INT),
        wc.Property(name='company', data_type=wc.DataType.TEXT),
        wc.Property(name='doc_type', data_type=wc.DataType.TEXT),
        wc.Property(name='raw_text', data_type=wc.DataType.TEXT),
    ],
    # No server-side vectorizer: the vectors are supplied with each object.
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
)
auto_finance = client.collections.get("AutomotiveFinance")
# Collect the full path of every '.json' entry directly inside the source folder.
json_top_file_path = r'C:\Users\ikim1\OneDrive\Desktop\RAG file'
json_file_path = [
    os.path.join(json_top_file_path, entry)
    for entry in os.listdir(json_top_file_path)
    if entry.endswith('.json')
]
# Initialize the text splitter + embedding model
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     # Maximum size of each chunk
    chunk_overlap=100,  # Overlap between consecutive chunks
)
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

# Build one DataObject (properties + vector) per text chunk, across all JSON files.
chunks_with_metadata_list = []
for one_json_path in json_file_path:
    # Each JSON has the keys 'pages', 'file_path', 'company' and 'doc_type';
    # 'pages' is a list with one element per page.
    # The encoding is explicit so the result does not depend on the Windows
    # locale code page.
    with open(one_json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    pages = json_data['pages']
    company = json_data['company']
    doc_type = json_data['doc_type']

    # Every page is combined with the page before it, so a chunk can span the
    # boundary between two consecutive pages. The first page only primes the
    # "previous page" cache.
    # NOTE(review): a document with a single page therefore produces no chunks.
    old_page_num = 0
    old_md = ''
    old_raw_txt = ''
    for i, page in enumerate(pages):
        md = page['md']
        raw_txt = page['text']
        page_num = page['page']
        print(i)

        if i > 0:
            old_combined_str = "THIS IS PAGE " + str(old_page_num) + '\n' + old_md + '\n' + old_raw_txt
            new_combined_str = "THIS IS PAGE " + str(page_num) + '\n' + md + '\n' + raw_txt
            # NOTE(review): the newer page is placed before the older one;
            # confirm this ordering is intended.
            combined_str = new_combined_str + '\n' + old_combined_str

            # Split the combined text into overlapping chunks.
            chunks = text_splitter.split_text(combined_str)

            # embed_documents() takes a LIST of strings and returns one vector
            # per string. Passing a single string makes it iterate over the
            # characters, so `[0]` would be the vector of the first character
            # only. Embed the whole batch of chunks in one call instead.
            embedded_chunks = embedding_model.embed_documents(chunks)

            # Attach the metadata and the vector to every chunk.
            for chunk, embedded_chunk in zip(chunks, embedded_chunks):
                chunk_metadata = {
                    "page1": old_page_num, "page2": page_num,
                    "company": company, "doc_type": doc_type,
                    'raw_text': chunk,
                }
                chunks_with_metadata_list.append(wvc.data.DataObject(
                    properties=chunk_metadata,
                    vector=embedded_chunk,
                ))

        # Cache the current page as the "previous" page for the next iteration.
        old_md = md
        old_raw_txt = raw_txt
        old_page_num = page_num

# Upload all chunks (with their vectors) in one call.
auto_finance.data.insert_many(chunks_with_metadata_list)
import time

# Run the similarity search inside try/finally so the client connection is
# closed even when the query raises.
try:
    # embed_query() embeds ONE string. embed_documents() expects a list of
    # strings; given a bare string it iterates over the characters, so `[0]`
    # would be the vector of the first character only.
    query_vector = embedding_model.embed_query("what is Honda Cash and cash equivalents?")

    # NOTE(review): presumably a short pause to let the inserted objects settle
    # before querying; confirm whether it is needed.
    time.sleep(2)

    response = auto_finance.query.near_vector(
        near_vector=query_vector,
        limit=4,
    )
    # Print the stored properties of each returned object.
    for o in response.objects:
        for key in ('raw_text', 'company', 'page1', 'page2'):
            print(o.properties[key])
finally:
    client.close()
Then I followed this section to do near_vector() search:
from weaviate.classes.query import MetadataQuery
import weaviate
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import base64
import time

embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
client = weaviate.connect_to_local()

# try/finally guarantees the connection is closed even when the query raises.
try:
    auto_finance = client.collections.get("AutomotiveFinance")

    # embed_query() embeds ONE string and returns a single list of floats.
    # embed_documents() expects a list of strings; given a bare string it
    # iterates over the characters, so `[0]` would be the vector of the first
    # character only.
    query_vector = embedding_model.embed_query("what is Honda Cash and cash equivalents?")

    # NOTE(review): presumably a short pause carried over from the indexing
    # script; confirm whether it is needed here.
    time.sleep(2)

    # NOTE(review): the traceback in this report shows a "Python38" interpreter
    # path; confirm which Python version actually runs this script.
    response = auto_finance.query.near_vector(
        near_vector=query_vector,
        limit=4,
    )
    # Print the stored properties of each returned object.
    for o in response.objects:
        for key in ('raw_text', 'company', 'page1', 'page2'):
            print(o.properties[key])
finally:
    client.close()
But I am getting the following error:
Traceback (most recent call last):
File "c:/Users/ikim1/OneDrive/Desktop/RAG file/SimSearch.py", line 16, in <module>
response = auto_finance.query.near_vector(
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\syncify.py", line 23, in sync_method
return _EventLoopSingleton.get_instance().run_until_complete(
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\event_loop.py", line 40, in run_until_complete
return fut.result()
File "C:\Users\ikim1\AppData\Local\Programs\Python38\lib\concurrent\futures\_base.py", line 439, in result
return self.__get_result()
File "C:\Users\ikim1\AppData\Local\Programs\Python38\lib\concurrent\futures\_base.py", line 388, in __get_result
raise self._exception
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\collections\queries\near_vector\query.py", line 92, in near_vector
res = await self._query.near_vector(
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\collections\grpc\query.py", line 361, in near_vector
_validate_input(
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 31, in _validate_input
if not any(_is_valid(exp, validate.value) for exp in validate.expected):
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 31, in <genexpr>
if not any(_is_valid(exp, validate.value) for exp in validate.expected):
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 61, in _is_valid
return all(isinstance(val, args[0]) for val in value)
File "C:\Users\ikim1\RAG-blog\lib\site-packages\weaviate\validator.py", line 61, in <genexpr>
return all(isinstance(val, args[0]) for val in value)
TypeError: isinstance() arg 2 must be a type or tuple of types
I think I followed every step; if not, please let me know and I can test the missing ones. From what I have seen, args[0] evaluates to ~T because the "expected" type is typing.List. I am not sure why it is coded like this, so please let me know. Thanks!