Hi everyone! Thanks for this amazing product. I’m currently testing it out, but I’m encountering a few issues.
Description
When I attempt to batch import using dynamic batching, I encounter issues feeding the data into the vector database. I discovered this was due to a hardcoded gRPC buffer size. As a workaround, I switched to fixed batching, but it doesn’t seem to be optimized.
Another problem I’m facing is with vectorization. When I use the client method fetch_object_by_id(..., include_vector=...), it returns an empty vector. This makes searching impossible as it results in inconsistent objects.
Also, the score when I perform a near_text query is always 0.0:
from weaviate.classes.query import Rerank, MetadataQuery
# Vector search over the Pages collection.
# NOTE: `score` is only populated by keyword (BM25) and hybrid searches; a pure
# vector search such as near_text reports relevance through `distance` (and
# `certainty`), so those are the metadata fields requested here.
response = pages_collection.query.near_text(
    query="Sample Query",
    limit=3,
    return_metadata=MetadataQuery(distance=True, certainty=True),
)

# Print the first result, guarding against a query that matched nothing.
if response.objects:
    result = response.objects[0]
    print(result)
    print(
        f"distance={result.metadata.distance}, "
        f"certainty={result.metadata.certainty}"
    )
else:
    print("near_text returned no objects")
Server Setup Information
- Weaviate Server Version: 1.26.5
- Deployment Method: docker
- Multi Node? Number of Running Nodes: 1
- Client Language and Version: Python Client 4.8.1
- Multitenancy?: False
Any additional Information
Here are the collections I’ve created:
# "Documents" collection: a single text property, vectorized by the
# multi2vec-clip module from the `description` field only.
documents_vectorizer = Configure.Vectorizer.multi2vec_clip(
    text_fields=[Multi2VecField(name="description", weight=1.0)],
)
documents = client.collections.create(
    name="Documents",
    properties=[
        Property(name="description", data_type=wvc.config.DataType.TEXT),
    ],
    vectorizer_config=documents_vectorizer,
    vector_index_config=Configure.VectorIndex.dynamic(),
    reranker_config=Configure.Reranker.transformers(),
)
# "Pages" collection: an identifier (neither searchable nor filterable), a text
# description and the page itself as a BLOB, linked to its parent document
# through the "belongsTo" reference.
page_properties = [
    Property(
        name="identifier",
        data_type=DataType.TEXT,
        index_searchable=False,
        index_filterable=False,
    ),
    Property(name="description", data_type=DataType.TEXT),
    Property(name="page", data_type=DataType.BLOB),
]
page_references = [
    ReferenceProperty(name="belongsTo", target_collection="Documents"),
]
# The text field and the image field are weighted equally (0.5 each).
page_vectorizer = Configure.Vectorizer.multi2vec_clip(
    text_fields=[Multi2VecField(name="description", weight=.5)],
    image_fields=[Multi2VecField(name="page", weight=.5)],
)
pages = client.collections.create(
    name="Pages",
    properties=page_properties,
    references=page_references,
    vectorizer_config=page_vectorizer,
    vector_index_config=Configure.VectorIndex.dynamic(),
    reranker_config=Configure.Reranker.transformers(),
)
# "Chunks" collection: same layout as a page (identifier, description, BLOB),
# with each chunk linked to the page it was cut from through "belongsTo".
chunk_properties = [
    Property(
        name="identifier",
        data_type=DataType.TEXT,
        index_searchable=False,
        index_filterable=False,
    ),
    Property(name="description", data_type=DataType.TEXT),
    Property(name="chunk", data_type=DataType.BLOB),
]
chunk_references = [
    ReferenceProperty(name="belongsTo", target_collection="Pages"),
]
# The text field and the image field are weighted equally (0.5 each).
chunk_vectorizer = Configure.Vectorizer.multi2vec_clip(
    text_fields=[Multi2VecField(name="description", weight=.5)],
    image_fields=[Multi2VecField(name="chunk", weight=.5)],
)
chunks = client.collections.create(
    name="Chunks",
    properties=chunk_properties,
    references=chunk_references,
    vectorizer_config=chunk_vectorizer,
    vector_index_config=Configure.VectorIndex.dynamic(),
    reranker_config=Configure.Reranker.transformers(),
)
# Add the Pages -> Chunks reference ("hasChunks") to the existing "Pages"
# collection.
pages = client.collections.get("Pages")
has_chunks_reference = ReferenceProperty(
    name="hasChunks",
    target_collection="Chunks",
)
pages.config.add_reference(has_chunks_reference)
Example of data import:
# Import the pages with fixed-size batching: fixed_size(18, 10) means
# 18 objects per batch and 10 concurrent requests.
with pages_collection.batch.fixed_size(18, 10) as batch:
    for page in pages_row:
        # Separate the uuid from the properties without mutating the row, so
        # the same rows can be imported again after a failed run.
        properties = {
            key: value for key, value in page.items() if key != "uuid"
        }
        batch.add_object(properties=properties, uuid=page["uuid"])

# The failures of the run are read from the collection's batch wrapper once
# the context manager has exited and every pending batch has been sent.
failed_objects = pages_collection.batch.failed_objects
if failed_objects:
    print(failed_objects)
Retrieve by id:
import weaviate
from weaviate.classes.query import Rerank, MetadataQuery
client = weaviate.connect_to_local()
try:
    pages_collection = client.collections.get("Pages")
    # "Pages" is created with a single, unnamed vectorizer, so it has no named
    # vector called "pages_vector" and asking for one yields an empty result.
    # Request the object's vector with include_vector=True and read it from
    # the "default" key instead.
    page_object = pages_collection.query.fetch_object_by_id(
        uuid="04b62467-3693-5019-b46f-0e435d00273a",
        include_vector=True,
    )
    if page_object is None:
        # fetch_object_by_id returns None when no object has this uuid.
        print("No object found for the given uuid")
    else:
        print(f"Vector 'default' :{page_object.vector.get('default')}")
finally:
    # Always release the connection, even if the query raises.
    client.close()
# Single-node Weaviate with GPU-backed CLIP, QnA and reranker inference
# containers.
services:
  weaviate:
    command:
      - --host
      - 0.0.0.0
      - --port
      - '8080'
      - --scheme
      - http
    image: cr.weaviate.io/semitechnologies/weaviate:1.26.5
    ports:
      - 8080:8080    # REST
      - 50051:50051  # gRPC (used by the v4 Python client)
    volumes:
      - weaviate_data:/var/lib/weaviate
    restart: on-failure:0
    environment:
      CLIP_INFERENCE_API: 'http://multi2vec-clip:8080'
      QNA_INFERENCE_API: 'http://qna-transformers:8080'
      RERANKER_INFERENCE_API: 'http://reranker-transformers:8080'
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'multi2vec-clip'
      ENABLE_MODULES: 'multi2vec-clip,qna-transformers,reranker-transformers'
      CLUSTER_HOSTNAME: 'node1'
      # The collections use the dynamic vector index, which needs async
      # indexing enabled.
      ASYNC_INDEXING: 'true'
  multi2vec-clip:
    image: semitechnologies/multi2vec-clip:sentence-transformers-clip-ViT-B-32-multilingual-v1
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: 'all'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
  qna-transformers:
    image: semitechnologies/qna-transformers:distilbert-base-cased-distilled-squad
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: 'all'
    # `deploy` is a key of the service, not part of the environment value.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
  reranker-transformers:
    image: semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: 'all'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
volumes:
  weaviate_data: