We are using batch ingestion.
During ingestion we need to check whether a record already exists before inserting it.
The sample code below shows how we ingest data in some cases.
However, updates take a very long time with the approach below.
Could you please suggest a more optimized way?
# Name of the collection that stores the book records.
class_name = "Book"

# Drop any existing collection with this name so it is re-created from scratch.
if client.collections.exists(class_name):
    client.collections.delete(class_name)

# Create the multi-tenant collection with its four text properties.
client.collections.create(
    name=class_name,
    properties=[
        wcc.Property(
            name="Book_name",
            data_type=wcc.DataType.TEXT,
            tokenization=wcc.Tokenization.FIELD,
        ),
        # Author is configured with skip_vectorization=True.
        wcc.Property(
            name="Author",
            data_type=wcc.DataType.TEXT,
            tokenization=wcc.Tokenization.WORD,
            skip_vectorization=True,
        ),
        wcc.Property(
            name="Book_Summary",
            data_type=wcc.DataType.TEXT,
            tokenization=wcc.Tokenization.FIELD,
        ),
        # NOTE(review): Update_date is declared as TEXT, not a date type --
        # confirm this is intended.
        wcc.Property(
            name="Update_date",
            data_type=wcc.DataType.TEXT,
            tokenization=wcc.Tokenization.FIELD,
            skip_vectorization=True,
        ),
    ],
    vectorizer_config=wcc.Configure.Vectorizer.text2vec_transformers(),
    multi_tenancy_config=wcc.Configure.multi_tenancy(enabled=True),
    inverted_index_config=wcc.Configure.inverted_index(index_timestamps=True),
)
def check_book_exist(client, class_name, value, tenant):
    """Look up an object by exact ``Book_name`` for a single tenant.

    Args:
        client: Client object exposing ``collections.get``.
        class_name: Name of the collection to query.
        value: Value that the ``Book_name`` property must equal.
        tenant: Tenant whose data is queried.

    Returns:
        The ``uuid`` of the matching object, or ``None`` when the query
        returns no objects.
    """
    tenant_collection = client.collections.get(class_name).with_tenant(tenant)
    name_filter = wvc.query.Filter.by_property("Book_name").equal(value)
    result = tenant_collection.query.fetch_objects(filters=name_filter, limit=1)

    # The query asks for at most one object; take the last returned one,
    # which is what iterating over the result and keeping the final uuid yields.
    matches = result.objects
    return matches[-1].uuid if matches else None
def update_records(uuid, collection, data, tenant, client):
    """Update the properties of one existing object for a single tenant.

    Args:
        uuid: Identifier of the object to update.
        collection: Name of the collection that holds the object.
        data: Mapping of property names to their new values.
        tenant: Tenant that owns the object.
        client: Client object exposing ``collections.get``.
    """
    tenant_collection = client.collections.get(collection).with_tenant(tenant)
    tenant_collection.data.update(uuid=uuid, properties=data)
def parser(data, tenant):
    """Insert one book record, or refresh ``Update_date`` if it already exists.

    The existence lookup and the update do not use the batch, so they run
    before any batch context is opened. A batch is only created on the insert
    path, which keeps the update path free of batch set-up and flush work.

    NOTE(review): a batch opened here only ever receives one object. For bulk
    loads, consider having the caller share one batch context across many
    records -- confirm against how ``parser`` is invoked.

    Args:
        data: Mapping with the keys ``Book_name``, ``Author``,
            ``Book_Summary`` and ``Update_date``; missing keys are stored
            as ``None``.
        tenant: Tenant the record belongs to.
    """
    book_name = data.get("Book_name")

    book_uuid = check_book_exist(
        client=client,
        class_name="Book",
        value=book_name,
        tenant=tenant,
    )

    # Existing record: only the update date is changed; no batch is needed.
    if book_uuid is not None:
        update_records(
            tenant=tenant,
            client=client,
            uuid=book_uuid,
            collection="Book",
            data={"Update_date": data.get("Update_date")},
        )
        print("Update the record for book name", book_name)
        return

    # New record: copy the known fields and queue the object for insertion.
    book_data = {
        key: data.get(key)
        for key in ("Book_name", "Author", "Book_Summary", "Update_date")
    }
    with client.batch.fixed_size(batch_size=100) as batch:
        batch.add_object(
            properties=book_data,
            collection="Book",
            tenant=tenant,
        )
        print("Added the record for book name", book_name)