Description
I am trying to deploy a Weaviate cluster using Docker Swarm or Docker Compose, but I’m encountering issues where the nodes fail to join the cluster. The problem seems to be related to network interface selection during the cluster setup, as the nodes are unable to resolve each other’s addresses despite being able to ping one another successfully.
Server Setup Information
- Weaviate Server Version: 1.26.5
- Deployment Method: Docker Compose or Docker Swarm
- Multi Node? Number of Running Nodes: 3 nodes on symmetrical servers
Any additional Information
- Docker network:
Shared network in the swarm
docker network create --driver overlay --subnet=10.11.0.0/16 weaviate_euler_net
- Docker Compose configuration:
# Swarm stack for a three-node Weaviate 1.26.5 cluster.
# One service per Swarm host (euler-01 .. euler-03), each pinned with a
# placement constraint and attached to the pre-created overlay network
# weaviate_euler_net.
version: '3.8'

services:
  # First node: the gossip entry point that the other nodes list in
  # CLUSTER_JOIN (weaviate-node1:7100).
  weaviate-node1:
    image: cr.weaviate.io/semitechnologies/weaviate:1.26.5
    hostname: weaviate-node1
    deploy:
      replicas: 1
      placement:
        constraints: [node.hostname == euler-01]
    ports:
      - "8080:8080"
      - "50051:50051"
    environment:
      - TRANSFORMERS_INFERENCE_API=http://gpu-server:1000
      - QNA_INFERENCE_API=http://gpu-server:2000
      - NER_INFERENCE_API=http://gpu-server:3000
      - SUM_INFERENCE_API=http://gpu-server:4000
      - SPELLCHECK_INFERENCE_API=http://gpu-server:5000
      - RERANKER_INFERENCE_API=http://gpu-server:6000
      - QUERY_DEFAULTS_LIMIT=25
      - PERSISTENCE_DATA_PATH=/var/lib/weaviate
      - DEFAULT_VECTORIZER_MODULE=text2vec-transformers
      - ENABLE_MODULES=text2vec-transformers,qna-transformers,ner-transformers,sum-transformers,text-spellcheck,reranker-transformers
      - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true
      # Same name as the service hostname and as this node's RAFT_JOIN entry.
      - CLUSTER_HOSTNAME=weaviate-node1
      - CLUSTER_GOSSIP_BIND_PORT=7100
      - CLUSTER_DATA_BIND_PORT=7101
      # RAFT_BOOTSTRAP_EXPECT equals the number of names listed in RAFT_JOIN.
      - RAFT_JOIN=weaviate-node1,weaviate-node2,weaviate-node3
      - RAFT_BOOTSTRAP_EXPECT=3
    volumes:
      - weaviate_data_node1:/var/lib/weaviate
    networks:
      - weaviate_euler_net

  # Second node: joins the gossip ring through weaviate-node1.
  weaviate-node2:
    image: cr.weaviate.io/semitechnologies/weaviate:1.26.5
    hostname: weaviate-node2
    deploy:
      replicas: 1
      placement:
        constraints: [node.hostname == euler-02]
    ports:
      # Host ports differ from node 1 so the published ports do not collide.
      - "8081:8080"
      - "50052:50051"
    environment:
      - TRANSFORMERS_INFERENCE_API=http://gpu-server:1000
      - QNA_INFERENCE_API=http://gpu-server:2000
      - NER_INFERENCE_API=http://gpu-server:3000
      - SUM_INFERENCE_API=http://gpu-server:4000
      - SPELLCHECK_INFERENCE_API=http://gpu-server:5000
      - RERANKER_INFERENCE_API=http://gpu-server:6000
      - QUERY_DEFAULTS_LIMIT=25
      - PERSISTENCE_DATA_PATH=/var/lib/weaviate
      - DEFAULT_VECTORIZER_MODULE=text2vec-transformers
      - ENABLE_MODULES=text2vec-transformers,qna-transformers,ner-transformers,sum-transformers,text-spellcheck,reranker-transformers
      - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true
      # Same name as the service hostname and as this node's RAFT_JOIN entry.
      - CLUSTER_HOSTNAME=weaviate-node2
      - CLUSTER_GOSSIP_BIND_PORT=7102
      - CLUSTER_DATA_BIND_PORT=7103
      # Points at node 1's CLUSTER_GOSSIP_BIND_PORT (7100).
      - CLUSTER_JOIN=weaviate-node1:7100
      - RAFT_JOIN=weaviate-node1,weaviate-node2,weaviate-node3
      - RAFT_BOOTSTRAP_EXPECT=3
    volumes:
      - weaviate_data_node2:/var/lib/weaviate
    networks:
      - weaviate_euler_net

  # Third node: joins the gossip ring through weaviate-node1.
  weaviate-node3:
    image: cr.weaviate.io/semitechnologies/weaviate:1.26.5
    hostname: weaviate-node3
    deploy:
      replicas: 1
      placement:
        constraints: [node.hostname == euler-03]
    ports:
      # Host ports differ from nodes 1 and 2 so the published ports do not
      # collide.
      - "8082:8080"
      - "50053:50051"
    environment:
      - TRANSFORMERS_INFERENCE_API=http://gpu-server:1000
      - QNA_INFERENCE_API=http://gpu-server:2000
      - NER_INFERENCE_API=http://gpu-server:3000
      - SUM_INFERENCE_API=http://gpu-server:4000
      - SPELLCHECK_INFERENCE_API=http://gpu-server:5000
      - RERANKER_INFERENCE_API=http://gpu-server:6000
      - QUERY_DEFAULTS_LIMIT=25
      - PERSISTENCE_DATA_PATH=/var/lib/weaviate
      - DEFAULT_VECTORIZER_MODULE=text2vec-transformers
      - ENABLE_MODULES=text2vec-transformers,qna-transformers,ner-transformers,sum-transformers,text-spellcheck,reranker-transformers
      - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true
      # Was "node3", which matched neither the service hostname nor the
      # "weaviate-node3" entry in RAFT_JOIN; aligned with nodes 1 and 2.
      - CLUSTER_HOSTNAME=weaviate-node3
      - CLUSTER_GOSSIP_BIND_PORT=7104
      - CLUSTER_DATA_BIND_PORT=7105
      # Points at node 1's CLUSTER_GOSSIP_BIND_PORT (7100).
      - CLUSTER_JOIN=weaviate-node1:7100
      - RAFT_JOIN=weaviate-node1,weaviate-node2,weaviate-node3
      - RAFT_BOOTSTRAP_EXPECT=3
    volumes:
      - weaviate_data_node3:/var/lib/weaviate
    networks:
      - weaviate_euler_net

# One named volume per node, mounted at that node's PERSISTENCE_DATA_PATH.
volumes:
  weaviate_data_node1:
  weaviate_data_node2:
  weaviate_data_node3:

# The overlay network is created outside this stack (docker network create),
# hence external.
networks:
  weaviate_euler_net:
    external: true
- Launch Command:
docker stack deploy --compose-file docker-compose.yaml --with-registry-auth weaviate
- Deployment output:
weaviate_weaviate-node3.1.uos43z7w2kar@euler-03 | {"action":"bootstrap","build_git_commit":"353d907","build_go_version":"go1.22.7","build_image_tag":"1.26.5","build_wv_version":"1.26.5","join_list":{"weaviate-node1":8300,"weaviate-node2":8300,"weaviate-node3":8300},"level":"warning","msg":"unable to resolve any node address to join","time":"2024-10-16T14:52:05Z"}
weaviate_weaviate-node3.1.uos43z7w2kar@euler-03 | {"action":"bootstrap","build_git_commit":"353d907","build_go_version":"go1.22.7","build_image_tag":"1.26.5","build_wv_version":"1.26.5","join_list":{"weaviate-node1":8300,"weaviate-node2":8300,"weaviate-node3":8300},"level":"warning","msg":"unable to resolve any node address to join","time":"2024-10-16T14:52:06Z"}
weaviate_weaviate-node3.1.uos43z7w2kar@euler-03 | {"action":"bootstrap","build_git_commit":"353d907","build_go_version":"go1.22.7","build_image_tag":"1.26.5","build_wv_version":"1.26.5","join_list":{"weaviate-node1":8300,"weaviate-node2":8300,"weaviate-node3":8300},"level":"warning","msg":"unable to resolve any node address to join","time":"2024-10-16T14:52:08Z"}
weaviate_weaviate-node3.1.uos43z7w2kar@euler-03 | {"action":"bootstrap","build_git_commit":"353d907","build_go_version":"go1.22.7","build_image_tag":"1.26.5","build_wv_version":"1.26.5","join_list":{"weaviate-node1":8300,"weaviate-node2":8300,"weaviate-node3":8300},"level":"warning","msg":"unable to resolve any node address to join","time":"2024-10-16T14:52:09Z"}
Meanwhile, the hostnames do resolve from the other nodes, so they are clearly on the same network and using the correct network interface:
➜ ~ docker exec -it weaviate_weaviate-node3.1.uos43z7w2kark46wsfjt8o6vc ping weaviate-node1
PING weaviate-node1 (10.11.4.186): 56 data bytes
64 bytes from 10.11.4.186: seq=0 ttl=64 time=0.056 ms
64 bytes from 10.11.4.186: seq=1 ttl=64 time=0.097 ms
64 bytes from 10.11.4.186: seq=2 ttl=64 time=0.099 ms
64 bytes from 10.11.4.186: seq=3 ttl=64 time=0.097 ms
64 bytes from 10.11.4.186: seq=4 ttl=64 time=0.103 ms
64 bytes from 10.11.4.186: seq=5 ttl=64 time=0.097 ms
64 bytes from 10.11.4.186: seq=6 ttl=64 time=0.101 ms
64 bytes from 10.11.4.186: seq=7 ttl=64 time=0.097 ms
^C
--- weaviate-node1 ping statistics ---
8 packets transmitted, 8 packets received, 0% packet loss
round-trip min/avg/max = 0.056/0.093/0.103 ms
I feel like RAFT_JOIN and CLUSTER_HOSTNAME are not being resolved on the correct network interface: no matter what hosts you specify, it picks another host. Even if you specify the public IP (I know, it’s not good, but just for testing), all nodes will still try to join over the Docker network interface. I don’t understand this behavior.
The same logic applies with a simple docker compose file on each node using public/private IPs.