Hello Weaviate Support Team,
I’m encountering an issue when inserting data into a multi-node Weaviate cluster using Spark. I can successfully insert data into other tables, but when I attempt to insert data into the Ebright table, I receive the following error:
WeaviateErrorMessage(message=local index "Ebright" not found: deadline exceeded for waiting for update: version got=3 want=108, throwable=null)
The error occurs when writing batches of data using the following Spark job:
spark = SparkSession.getActiveSession()
df = spark.createDataFrame(documents)
df.write.format("io.weaviate.spark.Weaviate") \
.option("batchSize", 200) \
.option("scheme", "http") \
.option("host", weaviate_url) \
.option("grpc:host", "10.2.0.13:50051") \
.option("grpc:secured", "false") \
.option("className", weaviate_table_name) \
.mode("append").save()
The schema for the Ebright table is as follows:
properties = [
Property(name="chunk", data_type=DataType.TEXT, vectorize_property_name=False),
Property(name="chunk_id", data_type=DataType.INT, vectorize_property_name=False),
Property(name="chunk_unique_id", data_type=DataType.TEXT, vectorize_property_name=False),
Property(name="abs_url", data_type=DataType.TEXT, vectorize_property_name=False, skip_vectorization=True),
Property(name="file_name", data_type=DataType.TEXT, vectorize_property_name=False, skip_vectorization=True),
Property(name="server_relative_url", data_type=DataType.TEXT, vectorize_property_name=False, skip_vectorization=True),
Property(name="topic", data_type=DataType.TEXT, vectorize_property_name=False, skip_vectorization=True),
Property(name="time_created", data_type=DataType.DATE, vectorize_property_name=False, skip_vectorization=True),
Property(name="time_lastmodified", data_type=DataType.DATE, vectorize_property_name=False, skip_vectorization=True),
Property(name="unique_id", data_type=DataType.TEXT, vectorize_property_name=False, skip_vectorization=True),
]
Cluster Setup:
- 3-node cluster.
- Using Weaviate version 1.26.6.
- Spark for batch data insertion.
- Weaviate Python client 4.9.0.
I can see the collection already exists, so it's strange that the error says the index was not found:
test_col = client.collections.get("Ebright")
print(test_col)
<weaviate.Collection config={
"name": "Ebright",
"description": null,
"generative_config": null,
"inverted_index_config": {
"bm25": {
"b": 0.75,
"k1": 1.2
},
"cleanup_interval_seconds": 60,
"index_null_state": false,
"index_property_length": false,
"index_timestamps": false,
"stopwords": {
"preset": "en",
"additions": null,
"removals": null
}
},
"multi_tenancy_config": {
"enabled": false,
"auto_tenant_creation": false,
"auto_tenant_activation": false
},
"properties": [
{
"name": "chunk",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": false,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "chunk_id",
"description": null,
"data_type": "int",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": false,
"nested_properties": null,
"tokenization": null,
"vectorizer_config": {
"skip": false,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "chunk_unique_id",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": false,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "abs_url",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "file_name",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "server_relative_url",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "topic",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "time_created",
"description": null,
"data_type": "date",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": false,
"nested_properties": null,
"tokenization": null,
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "time_lastmodified",
"description": null,
"data_type": "date",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": false,
"nested_properties": null,
"tokenization": null,
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
},
{
"name": "unique_id",
"description": null,
"data_type": "text",
"index_filterable": true,
"index_range_filters": false,
"index_searchable": true,
"nested_properties": null,
"tokenization": "word",
"vectorizer_config": {
"skip": true,
"vectorize_property_name": false
},
"vectorizer": "text2vec-transformers"
}
],
"references": [],
"replication_config": {
"factor": 1,
"async_enabled": false,
"deletion_strategy": "NoAutomatedResolution"
},
"reranker_config": null,
"sharding_config": {
"virtual_per_physical": 128,
"desired_count": 3,
"actual_count": 3,
"desired_virtual_count": 384,
"actual_virtual_count": 384,
"key": "_id",
"strategy": "hash",
"function": "murmur3"
},
"vector_index_config": {
"quantizer": null,
"cleanup_interval_seconds": 300,
"distance_metric": "cosine",
"dynamic_ef_min": 100,
"dynamic_ef_max": 500,
"dynamic_ef_factor": 8,
"ef": -1,
"ef_construction": 128,
"filter_strategy": "sweeping",
"flat_search_cutoff": 40000,
"max_connections": 32,
"skip": false,
"vector_cache_max_objects": 1000000000000
},
"vector_index_type": "hnsw",
"vectorizer_config": {
"vectorizer": "text2vec-transformers",
"model": {
"poolingStrategy": "masked_mean"
},
"vectorize_collection_name": false
},
"vectorizer": "text2vec-transformers",
"vector_config": null
}>
Could you please help me diagnose this issue or provide suggestions on how to resolve the index update problem? Is there any specific configuration or tuning required for multi-node cluster setups in this context?
Thank you in advance!