Hi, as the title says, I am trying to compare the OpenAI Ada embedding model with SBERT's MiniLM model to see which one produces better search results and scores. I have created two classes with identical objects but different vectorizers: one using OpenAI and one with no vectorizer (I add the vectors myself during object upload).
My assumption is that when I run a hybrid search for the same query against these two classes, I should get similar but not identical scores for the returned objects, given that I’m using different vectorizers. However, that is not the case: a hybrid search for the same query against the two classes returns the same scores. Is this intended behaviour, or am I doing something wrong?
My end goal is to evaluate which embedding model performs better.
Here are the schema and the import code:
# Schema for the class whose objects are vectorized by the text2vec-openai
# module (no vectors are supplied by the client at import time).
class_obj_openai = {
    'class': 'openai',
    'properties': [
        {
            'name': 'title',
            'dataType': ['text'],
        },
        {
            'name': 'source',
            'dataType': ['text'],
        },
        {
            'name': 'content',
            'dataType': ['text'],
        },
    ],
    'vectorizer': 'text2vec-openai',
    'moduleConfig': {
        # The key must match the vectorizer module name exactly. It was
        # previously misspelled as 'tect2vec-openai', so these settings were
        # not associated with the text2vec-openai module.
        'text2vec-openai': {
            'vectorizeClassName': False,
            'model': 'ada',
            'modelVersion': '002',
            'type': 'text',
        },
    },
}
# Schema for the class that stores client-supplied vectors: 'vectorizer' is
# 'none', so every object must be uploaded together with its own vector.
class_obj_minilm = {
    'class': 'minilm',
    # All three properties are plain text fields with the same shape.
    'properties': [
        {'name': field_name, 'dataType': ['text']}
        for field_name in ('title', 'source', 'content')
    ],
    'vectorizer': 'none',
}
# Create both classes in the schema: the MiniLM class first, then the OpenAI one.
for class_definition in (class_obj_minilm, class_obj_openai):
    client.schema.create_class(class_definition)
# Import data into the MiniLM class.
with client.batch(batch_size=100, num_workers=10) as batch:
    # Queue every entry for batch upload. The "minilm" class has no
    # vectorizer, so each object is sent with the vector stored on the entry.
    for idx, entry in enumerate(entries):
        print(f"importing question: {idx+1}")
        meta = entry["metadata"]
        data_object = {
            "title": meta["title"],
            "source": meta["source"],
            "content": entry["page_content"],
        }
        client.batch.add_data_object(
            data_object,
            "minilm",
            vector=entry["vector"],
        )
# Import data into the OpenAI class.
with client.batch(batch_size=100, num_workers=1) as batch:
    # Queue every entry for batch upload. No vector is passed here; the
    # "openai" class declares the text2vec-openai vectorizer in its schema.
    for idx, entry in enumerate(entries):
        print(f"importing question: {idx+1}")
        meta = entry["metadata"]
        data_object = {
            "title": meta["title"],
            "source": meta["source"],
            "content": entry["page_content"],
        }
        client.batch.add_data_object(data_object, "openai")