Description
I have created a collection in Weaviate and ingested some documents into it using LlamaIndex. When I used the default search, I found that it consistently retrieved the wrong documents. I then tested BM25 search, and it gave higher scores to other documents, even though I had copied the entire phrase from the expected document.
Server Setup Information
- Weaviate Server Version: 1.24.10
- Deployment Method: Docker
- LlamaIndex Version: 0.10.42
Document Preparation
Document of interest: the article "The electricity and metering trends in Singapore" (IEEE Conference Publication, IEEE Xplore), downloaded as a PDF and stored locally.
I have 20 other documents that are ingested together with it for retrieval testing.
Python Setup Information
# Weaviate
import weaviate
from weaviate.classes.config import Configure, VectorDistances, Property, DataType
from weaviate.util import generate_uuid5
from weaviate.query import MetadataQuery
# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter
# Creating collection
def create_collection(client, collection_name):
    """Create a Weaviate collection set up for vector and BM25 search.

    The collection uses the ``text2vec-transformers`` vectorizer, an HNSW
    vector index with cosine distance, the ``transformers`` reranker, and an
    inverted index with custom BM25 parameters.

    Args:
        client: A connected Weaviate client.
        collection_name: Name of the collection to create.
    """
    client.collections.create(
        collection_name,
        vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
        # The comma after this argument was missing in the original snippet,
        # which made the whole call a syntax error.
        vector_index_config=Configure.VectorIndex.hnsw(
            distance_metric=VectorDistances.COSINE
        ),
        reranker_config=Configure.Reranker.transformers(),
        inverted_index_config=Configure.inverted_index(
            bm25_b=0.7,
            bm25_k1=1.25,
            index_null_state=True,
            index_property_length=True,
            index_timestamps=True,
        ),
    )
# Create index using LlamaIndex
def create_weaviate_index(client, index_name, doc_folder):
    """Create the Weaviate collection and ingest a folder of documents into it.

    Args:
        client: A connected Weaviate client.
        index_name: Name of the Weaviate collection backing the index.
        doc_folder: Directory containing the documents to ingest.

    Returns:
        The ``VectorStoreIndex`` backed by the Weaviate collection.
    """
    create_collection(client, index_name)
    vector_store = WeaviateVectorStore(
        weaviate_client=client, index_name=index_name, text_key="content"
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Start from an empty index bound to the Weaviate store; nodes are
    # inserted explicitly below.
    index = VectorStoreIndex.from_documents([], storage_context=storage_context)
    # SimpleDirectoryReader is only a reader; load_data() is what actually
    # returns the list of documents the node parser expects.
    documents = SimpleDirectoryReader(input_dir=doc_folder).load_data()
    # `node_parser` was never defined in the original snippet. SentenceSplitter
    # is imported but otherwise unused, so it is used here with its defaults.
    # NOTE(review): adjust chunk_size / chunk_overlap if the original run used
    # custom values.
    node_parser = SentenceSplitter()
    nodes = node_parser.get_nodes_from_documents(documents)
    index.insert_nodes(nodes)
    return index
client = weaviate.connect_to_local()
index_name = "LlamaIndex"
doc_folder = "/path/to/doc_folder"
# Keep the returned index: the LlamaIndex query snippet further down calls
# index.as_query_engine(), which would otherwise fail with a NameError.
index = create_weaviate_index(client, index_name, doc_folder)
Querying with documents
# Using LlamaIndex
# Ask the question through a query engine built from the index.
question = "EMA was created in 2001 to?"  # Took partial string from document
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)
# Print the metadata of every chunk the engine used as a source.
for source_node in response.source_nodes:
    print(source_node.metadata)  # Did not retrieve the document that I copied the string from
# Using Weaviate hybrid search, alpha set to 0
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?"  # Took partial string
# `embed_model` was never defined in the original snippet; use the embedding
# model configured globally in LlamaIndex instead.
# NOTE(review): this assumes Settings.embed_model is the model that was active
# when the documents were ingested -- confirm.
query_vector = Settings.embed_model.get_query_embedding(question)
response = collection.query.hybrid(
    query=question,
    # The comma after this argument was missing in the original snippet,
    # which made the whole call a syntax error.
    vector=query_vector,
    limit=5,
    # alpha=0 weights the result entirely toward the keyword (BM25) side.
    alpha=0,
    return_metadata=MetadataQuery(
        distance=True,
        certainty=True,
        score=True,
        explain_score=True,
    ),
)
for obj in response.objects:
    print(f"METADATA: {obj.metadata}")  # Did not retrieve the document that I copied the string from
# Using Weaviate bm25 search
# Keyword-only (BM25) lookup against the same collection.
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?"  # Took partial string
# Request every metadata field so the scoring of each hit can be inspected.
bm25_metadata = MetadataQuery(
    distance=True, certainty=True, score=True, explain_score=True
)
response = collection.query.bm25(
    query=question, limit=5, return_metadata=bm25_metadata
)
for hit in response.objects:
    print(f"METADATA: {hit.metadata}")  # Did not retrieve the document that I copied the string from
# Using Weaviate near_text search
# Vector lookup where Weaviate itself embeds the query text.
# NOTE(review): near_text embeds the query with the collection's configured
# vectorizer, whereas the stored vectors were presumably supplied by LlamaIndex
# at ingestion time. If those are different models the distances are not
# comparable -- confirm.
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?"  # Took partial string
near_text_metadata = MetadataQuery(
    distance=True, certainty=True, score=True, explain_score=True
)
response = collection.query.near_text(
    query=question, limit=5, return_metadata=near_text_metadata
)
for hit in response.objects:
    print(f"METADATA: {hit.metadata}")  # Did not retrieve the document that I copied the string from