I have 2 questions regarding the new GroupBy functionality with hybrid searches:
Can you use a cross-reference as the property to group by? For example, running a hybrid search on a “DocChunk” collection and grouping by a cross-ref to the parent “Doc”.
If yes, what does the objects_per_group parameter actually do? If I set that to 1, does that mean each group will consist of the highest ranking object in that group of objects? Does that also mean that if I specify objects_per_group=1 and number_of_groups=20, I should expect to get back the top “DocChunk” object for 20 different “Doc” objects?
For now, the only groupBy that works with a cross-reference property is the Aggregate one:
I have crafted this code to play around with it:
import weaviate
from weaviate.util import generate_uuid5
from weaviate import classes as wvc
# Connect to a locally running Weaviate instance.
client = weaviate.connect_to_local()

# "TestCategory" is the collection that items point to through a
# cross-reference. Any existing collection with this name is removed first.
client.collections.delete("TestCategory")
collection_category = client.collections.create(
    "TestCategory",
    properties=[
        wvc.config.Property(name="name", data_type=wvc.config.DataType.TEXT),
    ],
    # Any other "text2vec-*" vectorizer could be used here; if it is set to
    # "none" you must always provide the vectors yourself.
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        base_url="http://host.docker.internal:1234",
    ),
    # The `generative-openai` module is required for generative queries.
    generative_config=wvc.config.Configure.Generative.openai(
        base_url="http://host.docker.internal:1234",
    ),
)

# "TestItem" holds the objects that are searched and grouped. Each item has
# the category both as a cross-reference ("category") and as a plain text
# property ("flat_cat").
client.collections.delete("TestItem")
collection_item = client.collections.create(
    "TestItem",
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="flat_cat", data_type=wvc.config.DataType.TEXT),
    ],
    references=[
        wvc.config.ReferenceProperty(
            name="category",
            target_collection="TestCategory",
        ),
    ],
    # base_url targets LM Studio; comment it out to use OpenAI instead.
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        base_url="http://host.docker.internal:1234",
    ),
    # base_url targets LM Studio; comment it out to use OpenAI instead.
    generative_config=wvc.config.Configure.Generative.openai(
        base_url="http://host.docker.internal:1234",
    ),
)
Adding some data:
# Categories get UUIDs derived from a fixed seed string, so the items below
# can reference them by calling generate_uuid5() with the same seed.
collection_category.data.insert({"name": "Home"}, uuid=generate_uuid5("home"))
collection_category.data.insert({"name": "Car"}, uuid=generate_uuid5("car"))

# Car items. "flat_cat" repeats the category as plain text so it can be used
# as a group-by property; it is "car" here (the original had the typo "cat").
collection_item.data.insert(
    {"title": "Natural Car Smell", "flat_cat": "car"},
    references={"category": generate_uuid5("car")},
)
collection_item.data.insert(
    {"title": "Natural Car Chair", "flat_cat": "car"},
    references={"category": generate_uuid5("car")},
)

# Home items.
collection_item.data.insert(
    {"title": "Natural Home Smell", "flat_cat": "home"},
    references={"category": generate_uuid5("home")},
)
collection_item.data.insert(
    {"title": "Natural Home plants", "flat_cat": "home"},
    references={"category": generate_uuid5("home")},
)
Group using aggregate:
# Aggregate over every TestItem object, grouped by the "category"
# cross-reference property.
query = collection_item.aggregate.over_all(
    group_by=wvc.aggregate.GroupByAggregate(prop="category"),
)

# Print each group of the aggregation result.
for group in query.groups:
    print(group)
This will not work:
# Hybrid search grouped by the "category" cross-reference property.
# NOTE: per the accompanying explanation, this variant is the one that does
# not work; it is kept to demonstrate the limitation.
query = collection_item.query.hybrid(
    query="nature",
    group_by=wvc.query.GroupBy(
        prop="category",
        objects_per_group=1,
        number_of_groups=2,
    ),
)

# Loop variable is `obj`, not `object`, to avoid shadowing the builtin.
for obj in query.objects:
    print(obj)
This will work. Note that I am using the flat_cat property as the property to group by:
# Hybrid search grouped by the plain text property "flat_cat" instead of the
# cross-reference, requesting one object per group and two groups.
query = collection_item.query.hybrid(
    query="nature",
    group_by=wvc.query.GroupBy(
        prop="flat_cat",
        objects_per_group=1,
        number_of_groups=2,
    ),
)

# Loop variable is `obj`, not `object`, to avoid shadowing the builtin.
for obj in query.objects:
    print(obj)
Also, please feel free to voice your feature request on this issue so we can channel all discussion on this topic there: