- Weaviate Server Version: 1.27.0
- Deployment Method: docker
- Multi Node? Number of Running Nodes: single node
- Client Language and Version: Client: 4.9.3, python
- Multitenancy?: no
I’m encountering an issue with filtering results in Weaviate when using a `near_text` query.
First of all, this is how I create the collection:
# Create the "RAG" collection: flat text/number/date properties, a Hugging Face
# text2vec vectorizer and a Google generative module.
client.collections.create(
    name="RAG",
    properties=[
        wc.Property(name="transcription", data_type=wc.DataType.TEXT),
        wc.Property(name="data", data_type=wc.DataType.DATE),
        wc.Property(name="hora_inicio_video", data_type=wc.DataType.TEXT),
        wc.Property(name="hora_fim_video", data_type=wc.DataType.TEXT),
        wc.Property(name="chave_unica", data_type=wc.DataType.TEXT),
        wc.Property(name="highlights_assunto", data_type=wc.DataType.TEXT_ARRAY),
        wc.Property(name="highlight_start", data_type=wc.DataType.NUMBER),
        wc.Property(name="highlight_end", data_type=wc.DataType.NUMBER),
        wc.Property(name="action_log", data_type=wc.DataType.TEXT_ARRAY),
        wc.Property(name="action_log_start", data_type=wc.DataType.NUMBER),
        wc.Property(name="action_log_end", data_type=wc.DataType.NUMBER),
        wc.Property(name="location", data_type=wc.DataType.TEXT_ARRAY),
        wc.Property(name="offset_start", data_type=wc.DataType.NUMBER),
        wc.Property(name="offset_end", data_type=wc.DataType.NUMBER),
    ],
    # Timestamp indexing is a collection-level inverted-index option, not a
    # Property argument, so it is configured here instead of on the "data"
    # property where it had no documented effect.
    inverted_index_config=wc.Configure.inverted_index(index_timestamps=True),
    vectorizer_config=wc.Configure.Vectorizer.text2vec_huggingface(
        model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    ),
    generative_config=wc.Configure.Generative.google(
        project_id=project,
        model_id="gemini-1.5-pro-preview-0514",
        temperature=0.3,
    ),
)
Here is my initial query that works fine:
# Baseline semantic search: no filter, a single result, with the vector
# distance requested as metadata.
response_time_near_text = rag.query.near_text(
    return_metadata=wq.MetadataQuery(distance=True),
    limit=1,
    query="limpar a piscina",
)
This query returns data and metadata successfully. However, when I take one of the values returned (in this case `chave_unica`, but the same happens with the other fields) and use it as an exact-match filter, the query returns no results:
# Same search, restricted to objects whose "chave_unica" property equals the
# key copied from the previous result.
response_time_near_text = rag.query.near_text(
    filters=wq.Filter.by_property("chave_unica").equal(
        "060a2b340101010101010f0013-000000-00000141d33e6dbe-060e2b347f7f-2a80"
    ),
    return_metadata=wq.MetadataQuery(distance=True),
    limit=1,
    query="limpar a piscina",
)
QueryReturn(objects=[])
The filter value "060a2b340101010101010f0013-000000-00000141d33e6dbe-060e2b347f7f-2a80"
is copied exactly from the metadata returned in the first query, but with this filter applied, the query returns no results.
Additionally, if I use:
# Same search with a not_equal filter on "chave_unica"; the value below is
# intentionally different from the stored key.
response_time_near_text = rag.query.near_text(
    filters=wq.Filter.by_property("chave_unica").not_equal(
        "60a2b340101010101010f0013-000000-00000141d33e6dbe-060e2b347f7f-2a80"
    ),
    return_metadata=wq.MetadataQuery(distance=True),
    limit=1,
    query="limpar a piscina",
)
In other words, when I run the query with `not_equal` and deliberately provide an incorrect value, the query returns data correctly.
Any advice on why this happens or how to fix it would be greatly appreciated. Thank you!
I’ve performed some additional testing.
If I populate my collection with the following code (using langchain):
def carregar_documentos(pasta):
    """Load every JSON file directly inside *pasta* as ``Document`` objects.

    Each file must contain a JSON array of chunks; every chunk provides a
    ``"transcription"`` entry (used as ``page_content``) and a ``"metadata"``
    entry (passed through as ``metadata``). Sub-directories are skipped.

    Args:
        pasta: Path of the directory to scan (not recursive).

    Returns:
        A list with one ``Document`` per chunk, files visited in sorted
        name order.
    """
    documentos = []
    # os.listdir returns entries in arbitrary order; sort for a stable result.
    for arquivo in sorted(os.listdir(pasta)):
        caminho_completo = os.path.join(pasta, arquivo)
        if not os.path.isfile(caminho_completo):
            continue
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding (accented text breaks under e.g. cp1252).
        with open(caminho_completo, 'r', encoding="utf-8") as f:
            dados = json.load(f)
        documentos.extend(
            Document(page_content=chunk['transcription'], metadata=chunk['metadata'])
            for chunk in dados
        )
    return documentos
# Load the chunks from disk, then let LangChain embed them client-side and
# write them to the "RAG" index.
documents = carregar_documentos('data/data_constructed')
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
db = WeaviateVectorStore.from_documents(
    documents,
    embedding=embeddings,
    client=client,
    index_name="RAG",
)
The search works fine. The issue is that I want to avoid having to define the embeddings like embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
So I tried this other way (batch.dynamic()):
def carregar_documentos(pasta):
    """Load every JSON file directly inside *pasta* as plain dicts.

    Each file must contain a JSON array of chunks; every chunk provides a
    ``"transcription"`` entry and a ``"metadata"`` entry. Sub-directories
    are skipped.

    Args:
        pasta: Path of the directory to scan (not recursive).

    Returns:
        A list with one ``{"page_content": ..., "metadata": ...}`` dict per
        chunk, files visited in sorted name order.
    """
    documentos = []
    # os.listdir returns entries in arbitrary order; sort for a stable result.
    for arquivo in sorted(os.listdir(pasta)):
        caminho_completo = os.path.join(pasta, arquivo)
        if not os.path.isfile(caminho_completo):
            continue
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding (accented text breaks under e.g. cp1252).
        with open(caminho_completo, "r", encoding="utf-8") as f:
            dados = json.load(f)
        documentos.extend(
            {
                "page_content": chunk["transcription"],
                "metadata": chunk["metadata"],
            }
            for chunk in dados
        )
    return documentos
# Load the chunk dicts from disk and import them into the "RAG" collection
# using the client's dynamic batching.
data_rows = carregar_documentos('data/data_constructed')
rag = client.collections.get("RAG")
with rag.batch.dynamic() as batch:
    for data_row in data_rows:
        # The loader yields {"page_content": ..., "metadata": {...}}, while the
        # collection declares flat properties ("transcription", "chave_unica",
        # ...). Inserting the row unchanged stores the values under
        # "page_content" and a nested "metadata" object, which is the likely
        # reason Filter.by_property("chave_unica") matches nothing.
        # Assumes the metadata keys are the schema property names - TODO confirm.
        # NOTE(review): objects already imported with the old shape must be
        # deleted and re-imported for the filter to work.
        properties = {
            "transcription": data_row["page_content"],
            **data_row["metadata"],
        }
        try:
            # add_object only queues the object; the actual insert happens
            # when the batch is flushed.
            batch.add_object(properties=properties)
            print(f"Adicionado: {data_row}")
        except Exception as e:
            print(f"Erro ao adicionar {data_row}: {e}")

# Insert errors reported by the server are not raised inside the loop; the
# client collects them, so surface them once the batch has been flushed.
failed_objects = rag.batch.failed_objects
if failed_objects:
    print(
        f"Erro: {len(failed_objects)} objetos falharam na importação; "
        f"primeiro erro: {failed_objects[0].message}"
    )
In both cases, if I run this check:
# Aggregate over the whole collection, requesting only the total object count.
response = rag.aggregate.over_all(total_count=True)
everything is fine.