Hello, I would like to know if there are any good options or strategies for “penalizing” objects based on how common a specific TEXT property is among the results.
My collection consists of ca. 35,000 objects, each representing a statistical dataset. Each object has a “label” property, which is the dataset's title and is the text used to generate the vector.
Each object also has a “provider_id” property, which is the name of the organization that provided the dataset. There are about 15 different providers in the collection.
My problem is that when I query (hybrid or semantic) for a dataset, the results are often dominated by datasets from the same provider. Often these have very similar titles, so the results are not very diverse.
I would like to know if there is a way to penalize objects based on how common the “provider_id” is among the results.
My idea is to apply a decaying weight per provider: if the first result's provider is “A”, then the next result from provider “A” would have its score multiplied by 0.9, the one after that by 0.9^2, and so on.
Is there a way to do this in Weaviate? Or do you have any other suggestions on how to approach this problem?
- My Weaviate cluster is a serverless Weaviate Cloud instance.
- I use the Python client to query the Weaviate instance.
Thank you in advance!
Here is the schema of the collection:
{
  "invertedIndexConfig": {
    "bm25": {"b": 0.75, "k1": 1.2},
    "cleanupIntervalSeconds": 60,
    "indexNullState": false,
    "indexPropertyLength": false,
    "indexTimestamps": false,
    "stopwords": {"preset": "en"}
  },
  "multiTenancyConfig": {
    "enabled": false,
    "autoTenantCreation": false,
    "autoTenantActivation": false
  },
  "properties": [
    {
      "name": "dataset_id",
      "description": "The id of the dataset.",
      "dataType": ["int"],
      "indexFilterable": false,
      "indexSearchable": false,
      "indexRangeFilters": false,
      "tokenization": null,
      "moduleConfig": {"none": {}}
    },
    {
      "name": "provider_id",
      "description": "The provider id of the table.",
      "dataType": ["text"],
      "indexFilterable": true,
      "indexSearchable": true,
      "indexRangeFilters": false,
      "tokenization": "word",
      "moduleConfig": {"none": {}}
    },
    {
      "name": "label",
      "description": "The title of the table.",
      "dataType": ["text"],
      "indexFilterable": true,
      "indexSearchable": true,
      "indexRangeFilters": false,
      "tokenization": "word",
      "moduleConfig": {"none": {}}
    },
    {
      "name": "dimensions",
      "description": "The dimension/variable names for the table.",
      "dataType": ["text[]"],
      "indexFilterable": true,
      "indexSearchable": true,
      "indexRangeFilters": false,
      "tokenization": "word",
      "moduleConfig": {"none": {}}
    },
    {
      "name": "uuid",
      "description": "This property was generated by Weaviate's auto-schema feature on Fri Dec 6 20:58:14 2024",
      "dataType": ["uuid"],
      "indexFilterable": true,
      "indexSearchable": false,
      "indexRangeFilters": false,
      "tokenization": null,
      "moduleConfig": {"none": {}}
    }
  ],
  "replicationConfig": {
    "factor": 1,
    "asyncEnabled": false,
    "deletionStrategy": "NoAutomatedResolution"
  },
  "shardingConfig": {
    "virtualPerPhysical": 128,
    "desiredCount": 1,
    "actualCount": 1,
    "desiredVirtualCount": 128,
    "actualVirtualCount": 128,
    "key": "_id",
    "strategy": "hash",
    "function": "murmur3"
  },
  "vectorConfig": {
    "default": {
      "vectorizer": {
        "text2vec-openai": {
          "baseURL": "https://api.openai.com",
          "dimensions": 3072,
          "model": "text-embedding-3-large",
          "vectorizeClassName": true,
          "properties": null
        }
      },
      "vectorIndexConfig": {
        "cleanupIntervalSeconds": 300,
        "distanceMetric": "cosine",
        "dynamicEfMin": 100,
        "dynamicEfMax": 500,
        "dynamicEfFactor": 8,
        "ef": -1,
        "efConstruction": 128,
        "filterStrategy": "sweeping",
        "flatSearchCutoff": 40000,
        "maxConnections": 32,
        "skip": false,
        "vectorCacheMaxObjects": 1000000000000
      },
      "vectorIndexType": "hnsw"
    }
  },
  "class": "Datasets",
  "moduleConfig": {"reranker-cohere": {"model": "rerank-v3.5"}}
}