import pandas as pd
from datasets import load_dataset
import weaviate
import time
import json
import traceback
# Connect to the Weaviate instance; replace the placeholders with your own
# endpoint and API keys before running.
client = weaviate.Client(
    url="http://<elb-url>",
    auth_client_secret=weaviate.AuthApiKey(api_key="<api-key>"),
    additional_headers={
        "X-OpenAI-Api-Key": "<openai-api-key>"
    }
)
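# Optional sanity check (an added sketch, not part of the original run):
# confirm the instance is reachable before issuing queries.
print(client.is_ready())  # True once the Weaviate instance is up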
# Run every query in the global query_strings list against the given class and
# record the latency, returned doc_ids, and reported distances per query.
def execute_query(class_name, limit):
    durations = []
    doc_ids = []
    distances = []
    for query_string in query_strings:
        start_time = time.time()
        response = (
            client.query
            .get(class_name, ["doc_id", "title", "text"])
            .with_near_text({"concepts": [query_string]})
            .with_limit(limit)
            .with_additional(["distance"])
            .do()
        )
        end_time = time.time()
        durations.append(end_time - start_time)
        doc_ids.append([item['doc_id'] for item in response['data']['Get'][class_name]])
        distances.append([item['_additional']['distance'] for item in response['data']['Get'][class_name]])
        # Pause between queries
        time.sleep(25)
    return durations, doc_ids, distances
query_strings = ["explain me about animation", "show me articles about science fiction stories"]
class_name = "DBPedia1MFlat_DOT"
limit = 3
durations, doc_ids, distances = execute_query(class_name, limit)
class_name = "DBPedia1MHNSW_DOT"
limit = 3
durations_hnsw, doc_ids_hnsw, distances_hnsw = execute_query(class_name, limit)
print(durations)
[0.83345627784729, 1.2585411071777344]
print(durations_hnsw)
[1.060271978378296, 1.1700620651245117]
print(doc_ids)
[['<dbpedia:Animation>', '<dbpedia:Computer_animation>', '<dbpedia:Anime>'], ['<dbpedia:Cyborgs_in_fiction>', '<dbpedia:Cyberpunk>', '<dbpedia:A_Fire_Upon_the_Deep>']]
print(doc_ids_hnsw)
[['<dbpedia:Animation>', '<dbpedia:History_of_animation>', '<dbpedia:Animator>'], ['<dbpedia:Science_fiction_magazine>', '<dbpedia:Space_Science_Fiction>', '<dbpedia:Sci_Fiction>']]
print(distances)
[[-0.87944764, -0.84737694, -0.8314848], [-0.8253831, -0.8045191, -0.8038292]]
print(distances_hnsw)
[[-0.87944764, -0.8792063, -0.85530674], [-0.86453146, -0.8593474, -0.8591885]]
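# Note on the values above: Weaviate's "dot" distance is the negative dot
# product, so more negative means more similar. ada-002 embeddings are
# normalized to unit length, which is why the best scores sit near -1.
# A toy illustration (an added sketch; numpy assumed available):
import numpy as np
a = np.array([0.6, 0.8])  # unit vector
b = np.array([0.6, 0.8])  # identical unit vector
print(-np.dot(a, b))      # -1.0, the best possible "dot" distance for unit vectors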
class_name = "DBPedia1MFlat"
limit = 3
durations, doc_ids, distances = execute_query(class_name, limit)
class_name = "DBPedia1MHNSW"
limit = 3
durations_hnsw, doc_ids_hnsw, distances_hnsw = execute_query(class_name, limit)
print(doc_ids)
[['<dbpedia:Animation>', '<dbpedia:Computer_animation>', '<dbpedia:Cartoon>'], ['<dbpedia:Cyborgs_in_fiction>', '<dbpedia:Cyberpunk>', '<dbpedia:A_Fire_Upon_the_Deep>']]
print()
print(doc_ids_hnsw)
[['<dbpedia:Animation>', '<dbpedia:History_of_animation>', '<dbpedia:Animator>'], ['<dbpedia:Science_fiction_magazine>', '<dbpedia:Space_Science_Fiction>', '<dbpedia:Sci_Fiction>']]
print()
print(distances)
[[0.1205529, 0.15262306, 0.18918735], [0.17461729, 0.19548094, 0.19617093]]
print()
print(distances_hnsw)
[[0.1205529, 0.12079358, 0.14469367], [0.13546896, 0.1406529, 0.14081162]]
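# The flat and HNSW top-3 lists differ slightly. A rough quality proxy
# (an added sketch, not output from the original run) is the per-query
# overlap between the two result lists:
def overlap_at_k(flat_lists, hnsw_lists):
    return [len(set(f) & set(h)) / len(f) for f, h in zip(flat_lists, hnsw_lists)]
print(overlap_at_k(doc_ids, doc_ids_hnsw))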
# Class definition for the HNSW index with dot-product distance
class_pedia_hnsw_dot = {
    "class": "DBPedia1MHNSW_DOT",
    "description": "Dataset containing https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M with an HNSW index and dot distance",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "dims": 1536,
    "invertedIndexConfig": {
        "bm25": {
            "b": 0.75,
            "k1": 1.2
        }
    },
    "properties": [
        {
            "name": "doc_id",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizePropertyName": False,
                    "skip": True,
                    "tokenization": "lowercase"
                }
            }
        },
        {
            "name": "title",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizePropertyName": True,
                    "skip": False,
                    "tokenization": "lowercase"
                }
            }
        },
        {
            "name": "text",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizePropertyName": True,
                    "skip": False,
                    "tokenization": "lowercase"
                }
            }
        }
    ],
    "vectorIndexType": "hnsw",
    "vectorIndexConfig": {
        "skip": False,
        "cleanupIntervalSeconds": 300,
        "maxConnections": 64,
        "efConstruction": 256,
        "ef": -1,
        "dynamicEfMin": 100,
        "dynamicEfMax": 500,
        "dynamicEfFactor": 50,
        "vectorCacheMaxObjects": 2000000,
        "flatSearchCutoff": 40000,
        "distance": "dot",
        "pq": {
            "enabled": False,
            "bitCompression": False,
            "segments": 0,
            "centroids": 256,
            "trainingLimit": 100000,
            "encoder": {
                "type": "kmeans",
                "distribution": "log-normal"
            }
        }
    }
}
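# Note: "ef": -1 enables dynamic ef, so the search list size scales with the
# query limit between dynamicEfMin and dynamicEfMax (scaled by dynamicEfFactor).
# PQ compression is disabled here, so full-precision vectors are used.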
# Class definition for the flat index with dot-product distance
class_pedia_flat_dot = {
    "class": "DBPedia1MFlat_DOT",
    "description": "Dataset containing https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M with a flat index and dot distance",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "dims": 1536,
    "invertedIndexConfig": {
        "bm25": {
            "b": 0.75,
            "k1": 1.2
        }
    },
    "properties": [
        {
            "name": "doc_id",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizePropertyName": False,
                    "skip": True,
                    "tokenization": "lowercase"
                }
            }
        },
        {
            "name": "title",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizePropertyName": True,
                    "skip": False,
                    "tokenization": "lowercase"
                }
            }
        },
        {
            "name": "text",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizePropertyName": True,
                    "skip": False,
                    "tokenization": "lowercase"
                }
            }
        }
    ],
    "vectorIndexType": "flat",
    "vectorIndexConfig": {
        "distance": "dot",
        "vectorCacheMaxObjects": 2000000,
        "pq": {
            "enabled": False,
            "rescoreLimit": -1,
            "cache": False
        },
        "bq": {
            "enabled": True,
            "rescoreLimit": 200,
            "cache": True
        }
    }
}
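# Unlike the HNSW class, the flat class enables binary quantization (bq), which
# compresses each vector dimension to a single bit; rescoreLimit=200 re-ranks the
# top 200 candidates with the original full-precision vectors to recover accuracy.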
print("Load the Hugging Face dataset")
dataset = load_dataset("KShivendu/dbpedia-entities-openai-1M")
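# Quick look at the fields used below (an added sketch): each record carries
# "_id", "title", "text", and a precomputed 1536-dim "openai" embedding.
sample = dataset['train'][0]
print(sample.keys())
print(len(sample["openai"]))  # 1536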
# Batch-import the dataset into the given class, reusing the precomputed
# OpenAI embeddings instead of re-vectorizing. Returns the import duration.
def import_data(class_name, dataset, batch_size=100, num_workers=1):
    counter = 0
    interval = 1000
    client.batch.configure(batch_size=batch_size, num_workers=num_workers)
    start_time = time.time()
    with client.batch as batch:
        for item in dataset['train']:
            try:
                properties = {
                    "doc_id": item["_id"],
                    "title": item["title"],
                    "text": item["text"]
                }
                batch.add_data_object(
                    data_object=properties,
                    class_name=class_name,
                    vector=item["openai"]
                )
                counter += 1
                if counter % interval == 0:
                    print(f"Imported {counter} articles...")
            except Exception as e:
                print(f"Error importing article {item['_id']}: {e}")
                traceback.print_exc()
        # Flush remaining items in the batch
        batch.flush()
    end_time = time.time()
    return end_time - start_time
client.schema.create_class(class_pedia_hnsw_dot)
client.schema.create_class(class_pedia_flat_dot)
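# Optional check (an added sketch): confirm both classes were created.
print(client.schema.get("DBPedia1MHNSW_DOT")["class"])
print(client.schema.get("DBPedia1MFlat_DOT")["class"])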
print(f"Running import with HNSW Index")
duration_dbpedia_hnsw = import_data("DBPedia1MHNSW_DOT",dataset, 1000, 4)
print(f"Running import with Flat Index")
duration_dbpedia_flat = import_data("DBPedia1MFlat_DOT",dataset, 1000, 4)
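# Optional check (an added sketch): verify the object count in each class.
for cls in ["DBPedia1MHNSW_DOT", "DBPedia1MFlat_DOT"]:
    result = client.query.aggregate(cls).with_meta_count().do()
    print(cls, result["data"]["Aggregate"][cls][0]["meta"]["count"])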