import weaviate
from weaviate.classes.init import Auth
import google.generativeai as genai
from typing import List, Dict
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from weaviate.classes import query as wvc
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
WEAVIATE_API_KEY = ""
WEAVIATE_URL = ""
gemini_api_key = ""
huggingface_api_key = ""
# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
print(client.is_ready())
huggingface_key = huggingface_api_key
headers = {
    "X-HuggingFace-Api-Key": huggingface_key,
}
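# This header lets Weaviate's text2vec-huggingface module call the Hugging Face Inference API on your behalf.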
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,                         # Weaviate cluster URL
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),  # Weaviate API key
    headers=headers,
)
# Initialize Gemini
genai.configure(api_key=gemini_api_key)
print("Client is Ready?", client.is_ready())
from weaviate import classes as wvc
client.collections.delete("WikipediaLangChain")
from weaviate.classes.config import Configure
client.collections.create(
    "WikipediaLangChain",
    vectorizer_config=[
        Configure.NamedVectors.text2vec_huggingface(
            name="title_vector",
            source_properties=["title"],
            model="sentence-transformers/all-MiniLM-L6-v2",
        )
    ],
)
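# Note: the named vector above uses all-MiniLM-L6-v2, which produces 384-dimensional
# embeddings and only vectorizes the "title" property.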
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",  # Google's text embedding model
    google_api_key=gemini_api_key,
)
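# Note: models/embedding-001 returns 768-dimensional vectors, a different
# dimensionality than the collection's 384-dimensional HuggingFace named vector.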
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
# import first article
loader = PyPDFLoader("brazil-wikipedia-article-text.pdf", extract_images=False)
docs = loader.load_and_split(text_splitter)
print(f"GOT {len(docs)} docs for Brazil")
db = WeaviateVectorStore.from_documents(docs, embeddings, client=client, index_name="WikipediaLangChain")
# import second article
loader = PyPDFLoader("netherlands-wikipedia-article-text.pdf", extract_images=False)
docs = loader.load_and_split(text_splitter)
print(f"GOT {len(docs)} docs for Netherlands")
db = WeaviateVectorStore.from_documents(docs, embeddings, client=client, index_name="WikipediaLangChain")
# Create vector store
vector_store = WeaviateVectorStore(
    client=client,
    index_name="WikipediaLangChain",
    text_key="text",
    embedding=embeddings,
    attributes=["source"],
)
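# Note: `docs` still holds the Netherlands chunks already imported by from_documents
# above, so the add_documents call below inserts those chunks a second time.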
vector_store.add_documents(docs)
# let's first get our collection
collection = client.collections.get("WikipediaLangChain")
response = collection.aggregate.over_all(total_count=True)
print(response)
# Group by source
response = collection.aggregate.over_all(group_by="source")
for group in response.groups:
    print(group.grouped_by.value, group.total_count)
# View object properties
obj = collection.query.fetch_objects(limit=1).objects[0]
print(obj.properties.keys())
print(obj.properties.get("source"))
print(obj.properties.get("page"))
print(obj.properties.get("text"))
# Query in French using Gemini
generateTask = "Quelle est la nourriture traditionnelle de ce pays?"  # "What is the traditional food of this country?"
source_file = "brazil-wikipedia-article-text.pdf"
model = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=gemini_api_key,
)
# let's do RAG directly using only Weaviate
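# Note: near_text embeds the query on the server with the collection's configured
# vectorizer (text2vec-huggingface here), not with the Google embeddings used at import.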
query = collection.generate.near_text(
    query="tradicional food",
    limit=10,
    grouped_task=generateTask,
)
print(query.generated)
AioRpcError Traceback (most recent call last)
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\collections\grpc\query.py:805, in _QueryGRPC.__call(self, request)
804 assert self._connection.grpc_stub is not None
--> 805 res = await _Retry(4).with_exponential_backoff(
806 0,
807 f"Searching in collection {request.collection}",
808 self._connection.grpc_stub.Search,
809 request,
810 metadata=self._connection.grpc_headers(),
811 timeout=self._connection.timeout_config.query,
812 )
813 return cast(search_get_pb2.SearchReply, res)
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\collections\grpc\retry.py:31, in _Retry.with_exponential_backoff(self, count, error, f, *args, **kwargs)
30 if e.code() != StatusCode.UNAVAILABLE:
--> 31 raise e
32 logger.info(
33 f"{error} received exception: {e}. Retrying with exponential backoff in {2**count} seconds"
34 )
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\collections\grpc\retry.py:28, in _Retry.with_exponential_backoff(self, count, error, f, *args, **kwargs)
27 try:
--> 28 return await f(*args, **kwargs)
29 except AioRpcError as e:
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\grpc\aio\_call.py:327, in _UnaryResponseMixin.__await__(self)
326 else:
--> 327 raise _create_rpc_error(
328 self._cython_call._initial_metadata,
329 self._cython_call._status,
330 )
331 else:
AioRpcError: <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "explorer: get class: concurrentTargetVectorSearch): explorer: get class: vector search: object vector search at index wikipedialangchain: shard wikipedialangchain_mj30ETuKNGfK: vector search: knn search: distance between entrypoint and query node: 768 vs 384: vector lengths don't match"
debug_error_string = "UNKNOWN:Error received from peer {grpc_message:"explorer: get class: concurrentTargetVectorSearch): explorer: get class: vector search: object vector search at index wikipedialangchain: shard wikipedialangchain_mj30ETuKNGfK: vector search: knn search: distance between entrypoint and query node: 768 vs 384: vector lengths don't match", grpc_status:2, created_time:"2025-01-08T06:29:39.4893321+00:00"}"
During handling of the above exception, another exception occurred:
WeaviateQueryError Traceback (most recent call last)
Cell In[59], line 5
1 # lets do a RAG directly using only Weaviate
----> 5 query = collection.generate.near_text(
6 query="tradicional food",
7
8 limit=10,
9 grouped_task=generateTask
10 )
11 print(query.generated)
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\syncify.py:23, in convert.<locals>.sync_method(self, __new_name, *args, **kwargs)
20 @wraps(method) # type: ignore
21 def sync_method(self, *args, __new_name=new_name, **kwargs):
22 async_func = getattr(cls, __new_name)
--> 23 return _EventLoopSingleton.get_instance().run_until_complete(
24 async_func, self, *args, **kwargs
25 )
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\event_loop.py:42, in _EventLoop.run_until_complete(self, f, *args, **kwargs)
40 raise WeaviateClosedClientError()
41 fut = asyncio.run_coroutine_threadsafe(f(*args, **kwargs), self.loop)
--> 42 return fut.result()
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\concurrent\futures\_base.py:458, in Future.result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\concurrent\futures\_base.py:403, in Future.__get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
406 self = None
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\collections\queries\near_text\generate.py:101, in _NearTextGenerateAsync.near_text(self, query, single_prompt, grouped_task, grouped_properties, certainty, distance, move_to, move_away, limit, offset, auto_limit, filters, group_by, rerank, target_vector, include_vector, return_metadata, return_properties, return_references)
28 async def near_text(
29 self,
30 query: Union[List[str], str],
(…)
49 return_references: Optional[ReturnReferences[TReferences]] = None,
50 ) -> GenerativeSearchReturnType[Properties, References, TProperties, TReferences]:
51 """Perform retrieval-augmented generation (RaG) on the results of a by-image object search in this collection using the image-capable vectorization module and vector-based similarity search.
52
53 See the docs for a more detailed explanation.
(…)
99 If the request to the Weaviate server fails.
100 """
--> 101 res = await self._query.near_text(
102 near_text=query,
103 certainty=certainty,
104 distance=distance,
105 move_to=move_to,
106 move_away=move_away,
107 limit=limit,
108 offset=offset,
109 autocut=auto_limit,
110 filters=filters,
111 group_by=_GroupBy.from_input(group_by),
112 rerank=rerank,
113 target_vector=target_vector,
114 generative=_Generative(
115 single=single_prompt,
116 grouped=grouped_task,
117 grouped_properties=grouped_properties,
118 ),
119 return_metadata=self._parse_return_metadata(return_metadata, include_vector),
120 return_properties=self._parse_return_properties(return_properties),
121 return_references=self._parse_return_references(return_references),
122 )
123 return self._result_to_generative_return(
124 res,
125 _QueryOptions.from_input(
(…)
135 return_references,
136 )
File c:\Users\dhanu\.conda\envs\idk_gpu\lib\site-packages\weaviate\collections\grpc\query.py:817, in _QueryGRPC.__call(self, request)
815 if e.code().name == PERMISSION_DENIED:
816 raise InsufficientPermissionsError(e)
--> 817 raise WeaviateQueryError(str(e), "GRPC search") # pyright: ignore
818 except WeaviateRetryError as e:
819 raise WeaviateQueryError(str(e), "GRPC search")
WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "explorer: get class: concurrentTargetVectorSearch): explorer: get class: vector search: object vector search at index wikipedialangchain: shard wikipedialangchain_mj30ETuKNGfK: vector search: knn search: distance between entrypoint and query node: 768 vs 384: vector lengths don't match"
debug_error_string = "UNKNOWN:Error received from peer {grpc_message:"explorer: get class: concurrentTargetVectorSearch): explorer: get class: vector search: object vector search at index wikipedialangchain: shard wikipedialangchain_mj30ETuKNGfK: vector search: knn search: distance between entrypoint and query node: 768 vs 384: vector lengths don't match", grpc_status:2, created_time:"2025-01-08T06:29:39.4893321+00:00"}"
.
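The mismatch reported in the error (768 vs 384) lines up with the two embedding models mixed above: LangChain imported the chunks with 768-dimensional Google embedding-001 vectors, while near_text embeds the query with the collection's 384-dimensional all-MiniLM-L6-v2 named vector. Below is a minimal sketch, assuming the goal is to query with the same Google embeddings used at import, by rebuilding the collection and searching through LangChain instead of near_text; the alternative would be to configure the collection's vectorizer to match the model that produced the stored vectors.

# Sketch only: let LangChain own both embedding steps, so the same 768-dimensional
# Google vectors are used at import time and at query time.
client.collections.delete("WikipediaLangChain")
db = WeaviateVectorStore.from_documents(
    docs, embeddings, client=client, index_name="WikipediaLangChain"
)  # both PDFs would need re-importing; only the last `docs` list is shown here
results = db.similarity_search("traditional food", k=10)  # query embedded with embedding-001
for doc in results:
    print(doc.page_content[:100])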