Unable to get expected results using BM25 or any search functions

Description

I created a collection in Weaviate and ingested some documents into it using LlamaIndex. With the default search, the wrong documents were retrieved every time. I then tested BM25 search, and it gave higher scores to other documents even though I had copied an entire phrase from the expected document as the query.

Server Setup Information

  • Weaviate Server Version: 1.24.10
  • Deployment Method: Docker
  • LlamaIndex Version: 0.10.42

Document Preparation

Document of interest: the article "The electricity and metering trends in Singapore" (IEEE Conference Publication, IEEE Xplore), downloaded as a PDF and stored locally.
I have 20 other documents that are ingested together with it for retrieval testing.

Python Setup Information

# Weaviate
import weaviate
from weaviate.classes.config import Configure, VectorDistances, Property, DataType
from weaviate.util import generate_uuid5
from weaviate.query import MetadataQuery

# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter
# Creating collection
def create_collection(client, collection_name):
  """Create a Weaviate collection with vectorizer, HNSW, reranker and BM25 settings.

  Args:
    client: Connected Weaviate client; its ``collections.create`` is called.
    collection_name: Name of the collection to create.
  """
  client.collections.create(
    collection_name,
    vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
    # The comma ending the next line was missing, which made the whole
    # call a SyntaxError.
    vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
    reranker_config=Configure.Reranker.transformers(),
    inverted_index_config=Configure.inverted_index(
      bm25_b=0.7,
      bm25_k1=1.25,
      index_null_state=True,
      index_property_length=True,
      index_timestamps=True,
    ),
  )
 
# Create index using LlamaIndex
def create_weaviate_index(client, index_name, doc_folder):
  """Create the collection, then chunk and insert every document in a folder.

  Args:
    client: Connected Weaviate client.
    index_name: Name of the Weaviate collection backing the index.
    doc_folder: Directory whose files are read with SimpleDirectoryReader.

  Returns:
    The VectorStoreIndex that the parsed nodes were inserted into.
  """
  create_collection(client, index_name)
  vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
  # Start from an empty index bound to the Weaviate store; nodes are added below.
  index = VectorStoreIndex.from_documents([], storage_context=storage_context)
  # SimpleDirectoryReader is only the reader object; load_data() yields the
  # documents that the node parser needs.
  documents = SimpleDirectoryReader(input_dir=doc_folder).load_data()
  # The original snippet used node_parser without ever defining it.
  node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
  nodes = node_parser.get_nodes_from_documents(documents)
  index.insert_nodes(nodes)
  return index

client = weaviate.connect_to_local()
index_name = "LlamaIndex"
doc_folder = "/path/to/doc_folder"
# Keep the returned index: the LlamaIndex query snippet calls
# index.as_query_engine(), which fails if the result is discarded here.
index = create_weaviate_index(client, index_name, doc_folder)

Querying with documents

# Using LlamaIndex
# Build a query engine from the index with its default settings.
query_engine = index.as_query_engine()
question = "EMA was created in 2001 to?" # Partial string taken from the expected document
response = query_engine.query(question)
print(response)

# Print the metadata of every source node attached to the response.
for node in response.source_nodes:
  print(node.metadata) # Did not retrieve the document that I copied the string from
# Using Weaviate hybrid search, alpha set to 0
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?" # Partial string taken from the expected document
# NOTE(review): embed_model is not defined in this snippet; presumably it is
# the embedding model used at ingestion time -- confirm.
query_vector = embed_model.get_query_embedding(question)

response = collection.query.hybrid(
  query=question,
  vector=query_vector,  # this comma was missing, which made the call a SyntaxError
  limit=5,
  # Per the Weaviate hybrid-search docs, alpha=0 weights the keyword (BM25)
  # side only, so the supplied vector does not influence the ranking.
  alpha=0,
  return_metadata=MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True,
  ),
)

for obj in response.objects:
  print(f"METADATA: {obj.metadata}") # Did not retrieve the document that I copied the string from
# Using Weaviate bm25 search
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?" # Partial string taken from the expected document
# Keyword-only query; at most 5 objects are returned.
response = collection.query.bm25(
  query=question,
  limit=5,

  # NOTE(review): distance and certainty are vector-search metrics and are
  # presumably not populated for a pure BM25 query -- confirm.
  return_metadata=MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True
  )
)

for obj in response.objects:
  print(f"METADATA: {obj.metadata}") # Did not retrieve the document that I copied the string from
# Using Weaviate near_text search
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?" # Partial string taken from the expected document
# The question text is passed as the near_text query; at most 5 objects are returned.
response = collection.query.near_text(
  query=question,
  limit=5,

  # Request every metadata field so the ranking can be inspected.
  return_metadata=MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True
  )
)

for obj in response.objects:
  print(f"METADATA: {obj.metadata}") # Did not retrieve the document that I copied the string from

Hi!

I was not able to access that PDF.

I will try to reproduce this, and it would be interesting to use the very same pdf.

Thanks!

1 Like

Hi there!

Here are my findings based on your code.

Let me know if this works for you.

I'm not sure which vectorizer you are using.

Here, when you do not provide the vectors through llama_index, Weaviate will vectorize the data for you. I'm also not sure whether you want to skip vectors altogether.

#!pip3 install -U weaviate-client llama_index llama-index-readers-file llama-index-embeddings-openai

# Weaviate
import weaviate
from weaviate.classes.config import Configure, VectorDistances, Property, DataType
from weaviate.util import generate_uuid5
from weaviate.classes.query import MetadataQuery

# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

import os
import openai

#os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

# Use a single embedding model everywhere. The original registered an
# OpenAIEmbedding without a model name in Settings and then built a second
# "text-embedding-3-small" instance that was only used for the smoke test,
# while the collection's text2vec_openai vectorizer is configured with
# "text-embedding-3-small". Naming the same model here keeps the vectors
# LlamaIndex stores and the vectors Weaviate computes in the same space.
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding(model="text-embedding-3-small", embed_batch_size=10)
Settings.embed_model = embed_model


# Smoke-test the embedding model: embed one sentence and show the first values.
embeddings = embed_model.get_text_embedding(
    "Open AI new Embeddings models is great."
)

print(embeddings[:5])


# Creating a Weaviate collection
def create_collection(client, collection_name):
    """Create a Weaviate collection wired to OpenAI modules.

    The collection gets an OpenAI generative module, the
    ``text-embedding-3-small`` vectorizer, a cosine HNSW vector index, the
    transformers reranker and explicit BM25 / inverted-index settings.

    Args:
        client: Connected Weaviate client; its ``collections.create`` is called.
        collection_name: Name of the collection to create.
    """
    # Build each configuration object first so the create() call reads as a
    # flat list of named settings.
    generative = Configure.Generative.openai()
    vectorizer = Configure.Vectorizer.text2vec_openai(model="text-embedding-3-small")
    vector_index = Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE)
    reranker = Configure.Reranker.transformers()
    inverted_index = Configure.inverted_index(
        bm25_b=0.7,
        bm25_k1=1.25,
        index_null_state=True,
        index_property_length=True,
        index_timestamps=True,
    )

    client.collections.create(
        collection_name,
        generative_config=generative,
        vectorizer_config=vectorizer,
        vector_index_config=vector_index,
        reranker_config=reranker,
        inverted_index_config=inverted_index,
    )
 
# Create index using LlamaIndex
def create_weaviate_index(client, index_name, doc_folder):
    """Create the collection and fill it with chunked documents from a folder.

    Args:
        client: Connected Weaviate client.
        index_name: Name of the Weaviate collection backing the index.
        doc_folder: Directory read with SimpleDirectoryReader.

    Returns:
        The VectorStoreIndex that received the parsed nodes.
    """
    create_collection(client, index_name)

    # Empty index bound to the Weaviate-backed store; nodes are inserted below.
    store = WeaviateVectorStore(
        weaviate_client=client,
        index_name=index_name,
        text_key="content",
    )
    index = VectorStoreIndex.from_documents(
        [],
        storage_context=StorageContext.from_defaults(vector_store=store),
    )

    # Load the files, then split them into chunks of 1024 with an overlap of 20.
    loaded_docs = SimpleDirectoryReader(input_dir=doc_folder).load_data()
    splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
    index.insert_nodes(
        splitter.get_nodes_from_documents(loaded_docs, show_progress=False)
    )
    return index

client = weaviate.connect_to_local()
index_name = "LlamaIndex"
#
# WARNING THIS WILL DELETE IF EXISTS
#
client.collections.delete(index_name)
doc_folder = "./pdfs"
create_weaviate_index(client, index_name, doc_folder)

# querying
collection = client.collections.get("LlamaIndex")
# Fetch one object together with its vector. As a bare expression this only
# displays something in a notebook cell; it prints nothing in a plain script.
collection.query.fetch_objects(include_vector=True, limit=1).objects[0].vector

# querying Weaviate directly
collections = client.collections.get("LlamaIndex")
# The loop variable is named obj (not object) so the builtin is not shadowed.
for obj in collections.query.bm25("food").objects:
    print(obj.properties)

# Re-attach LlamaIndex to the already populated collection.
vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents([], storage_context=storage_context)
query_engine = index.as_query_engine()

#filtering
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# Restrict retrieval to nodes whose file_name metadata equals "brazil".
filters = MetadataFilters(
    filters=[
        MetadataFilter(key="file_name", operator=FilterOperator.EQ, value="brazil"),
    ]
)

retriever = index.as_retriever(filters=filters)
retriever.retrieve("What is the traditional food of this country?")

# generating an answer
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from IPython.display import Markdown, display
# Same idea with ExactMatchFilter, this time on the query engine.
filters = MetadataFilters(
    filters=[ExactMatchFilter(key="file_name", value="netherlands")]
)
query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("What is the food of this country?")
# The original printed the literal text "{response}" because the string had
# no f-prefix; print the response object itself.
print(response)
1 Like

Hi @DudaNogueira, I found out that some of the documents were not properly tokenized when they were converted to node chunks, so they could not appear in the results.

For the filters, I want to use “like” rather than an exact match. When I used the ExactMatchFilter, it returned no results, whereas not using the filter did return results. Is there an equivalent way to perform the same filtering in llamaindex as in the query below?

 # Hybrid query restricted to objects whose "content" property matches the
 # like-pattern "test".
 response = collection.query.hybrid(
   query=question,
   vector=query_vector,  # this comma was missing, which made the call a SyntaxError
   limit=5,
   alpha=0.2,
   filters=Filter.by_property("content").like("test"),
   return_metadata=MetadataQuery(
     distance=True,
     certainty=True,
     score=True,
     explain_score=True,
   ),
 )

Hi!

Glad you figured it out!!

You can just adapt the code above, like so

# generating an answer
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from IPython.display import Markdown, display
# NOTE(review): MetadataFilters validates its entries against LlamaIndex's own
# filter schema (key / operator / value), so a Weaviate Filter object passed
# here may be rejected with a ValidationError -- confirm before relying on it.
filters = MetadataFilters(
    filters=[Filter.by_property("content").like("*test*")]
)
query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("What is the food of this country?")
# The original printed the literal text "{response}" because the string had
# no f-prefix; print the response object itself.
print(response)

Let me know if this helps!

1 Like

Sorry but this doesn’t work, running this gave me the following error:

# Reproduction of the failure: a Weaviate Filter is passed as an entry of
# LlamaIndex's MetadataFilters.
filters = MetadataFilters(
    filters=[Filter.by_property("content").like("*test*")]
)
ValidationError
  field required (type=value_error.missing)
filters -> 0 -> operator
  value is not a valid enumeration member; permitted: '==', '>', '<', '!=', ....... # and on so on
  filters -> 0 -> filters
field required (type=value_error.missing)

The Class MetadataFilters creates a MetadataFilter object with the required key, operator and value fields.

I have tried the following but still failed:

from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, FilterOperator

# Attempt using LlamaIndex's own filter types with the CONTAINS operator and a
# wildcard-style value on the "content" key.
filters = MetadataFilters(
  filters=[MetadataFilter(key="content", 
  operator=FilterOperator.CONTAINS, value="*test*")],
)

weaviate_query_engine = weaviate_index.as_query_engine(filters=filters)

question = "Tell me about xxxxxxx."
response = weaviate_query_engine.query(question)
print(response)
ValueError
  in response = weaviate_query_engine(question)
ValueError: Filter operator contains not supported

Looking at the FilterOperator provided by LlamaIndex:

class FilterOperator(str, Enum):
    """Vector store filter operator.

    String-valued enum: each member's value is the operator's textual form.
    The trailing comment on each member states the value types it applies to.
    """

    # TODO add more operators
    EQ = "=="  # default operator (string, int, float)
    GT = ">"  # greater than (int, float)
    LT = "<"  # less than (int, float)
    NE = "!="  # not equal to (string, int, float)
    GTE = ">="  # greater than or equal to (int, float)
    LTE = "<="  # less than or equal to (int, float)
    IN = "in"  # In array (string or number)
    NIN = "nin"  # Not in array (string or number)
    ANY = "any"  # Contains any (array of strings)
    ALL = "all"  # Contains all (array of strings)
    TEXT_MATCH = "text_match"  # full text match (allows you to search for a specific substring, token or phrase within the text field)
    CONTAINS = "contains"  # metadata array contains value (string or number)

Oh, you are right. Sorry!

I am not an expert in llamaindex, yet :grimacing:

I thought llamaindex would just pass the filter to the client. But it looks like there is something more to it:

1 Like

Hi! I am unsure which operator is equivalent to “like”, but it worked for me when I used the equal operator “EQ”.

Here is the working one:

from llama_index.core.vector_stores import (
  MetadataFilter,
  MetadataFilters,
  FilterOperator
)

keywords = ["AIS", "DTAC"]

# One EQ filter per keyword on the "content" key, with the keyword wrapped
# in "*" wildcards.
filters = [
  MetadataFilter(key="content", operator=FilterOperator.EQ, value=f"*{keyword}*")
  for keyword in keywords
]

# Bundle the filters together; stay at None when there are no keywords so the
# query engine is created without any filter.
metadata_filters = MetadataFilters(filters=filters) if filters else None

# Create query engine with filters
weaviate_query_engine = weaviate_index.as_query_engine(
  vector_store_query_mode="hybrid",
  alpha=0.2,
  similarity_top_k=5,
  filters=metadata_filters,
)

# Test your query
question = "Tell me about the market share of pre-paid mobile in Thailand"
response = weaviate_query_engine.query(question)

# Show which source documents the answer was built from.
for node in response.source_nodes:
  print(node.metadata)
1 Like

Awesome!

Thanks for sharing :slight_smile:

1 Like