The script runs without error and all the sequences are transferred correctly, but for the images I see only ~7M entries on the new instance, whereas the old one has ~14M. How is this possible? Can the UUIDs be duplicated in the original DB? I repeated the migration and got the same result.
def _report_batch_failures(collection, label, max_shown=10):
    """Print the objects and references that the last batch on *collection* failed to import.

    Per the Weaviate Python client documentation, a batch context manager
    does not raise for individual objects that fail to import; the failures
    are collected on ``collection.batch.failed_objects`` and
    ``collection.batch.failed_references``. Without this check, a partially
    failed import looks exactly like a successful one.
    """
    failed_objects = collection.batch.failed_objects
    failed_references = collection.batch.failed_references
    print(
        f"{label}: {len(failed_objects)} failed objects, "
        f"{len(failed_references)} failed references"
    )
    # Show only a sample so a large failure set does not flood the terminal.
    for failure in failed_objects[:max_shown]:
        print(f"  object error: {failure.message}")
    for failure in failed_references[:max_shown]:
        print(f"  reference error: {failure.message}")


# Copy the "Sequences" and "Images" collections from the old instance to the
# new one, keeping the original UUIDs so cross-references stay valid.
with get_client() as old_client, get_client_wcs() as new_client:
    if not new_client.collections.exists("Sequences"):
        new_client.collections.create(
            name="Sequences",
            vector_index_config=Configure.VectorIndex.hnsw(quantizer=None),
            properties=[
                Property(name="source_id", data_type=DataType.INT),
                Property(name="vectorizer_model", data_type=DataType.TEXT),
                Property(name="vectorized_text", data_type=DataType.TEXT),
                Property(name="title", data_type=DataType.TEXT),
                Property(name="description", data_type=DataType.TEXT),
                Property(name="source_type", data_type=DataType.TEXT),
                Property(name="page_url", data_type=DataType.TEXT),
                Property(name="date_added", data_type=DataType.DATE),
            ],
        )

    # Create the Images class
    if not new_client.collections.exists("Images"):
        images_collection = new_client.collections.create(
            name="Images",
            vector_index_config=Configure.VectorIndex.hnsw(quantizer=None),
            properties=[
                Property(name="save_url", data_type=DataType.TEXT),
                Property(name="source_id", data_type=DataType.INT),
                Property(name="vectorizer_model", data_type=DataType.TEXT),
                Property(name="order", data_type=DataType.INT),
                Property(name="image_height", data_type=DataType.INT),
                Property(name="image_width", data_type=DataType.INT),
                Property(name="date_added", data_type=DataType.DATE),
            ],
        )
        # Must stay inside this branch: `images_collection` is only bound
        # when the collection was created just above.
        images_collection.config.add_reference(
            ReferenceProperty(
                name="fromSequence",
                target_collection="Sequences",
                description="Sequence that this image is from",
            )
        )

    images_collection_new = new_client.collections.get("Images")
    sequences_collection_new = new_client.collections.get("Sequences")
    images_collection_old = old_client.collections.get("Images")
    sequences_collection_old = old_client.collections.get("Sequences")

    # NOTE(review): sequences are copied without vectors (include_vector=False),
    # exactly as before -- confirm that this is intended.
    sequences_read = 0
    with sequences_collection_new.batch.dynamic() as batch:
        for q in tqdm(
            sequences_collection_old.iterator(include_vector=False),
            desc="Copying Sequences",
        ):
            properties = q.properties
            properties["source_id"] = int(properties["source_id"])
            batch.add_object(properties=properties, uuid=q.uuid)
            sequences_read += 1
    print(f"Sequences: read {sequences_read} objects from the old instance")
    _report_batch_failures(sequences_collection_new, "Sequences")

    images_read = 0
    images_without_sequence = 0
    with images_collection_new.batch.fixed_size(
        concurrent_requests=10, batch_size=500
    ) as batch:
        for q in tqdm(
            images_collection_old.iterator(
                include_vector=True,
                return_references=[QueryReference(link_on="source_sequence")],
            ),
            desc="Copying Images",
        ):
            properties = q.properties
            properties["source_id"] = int(properties["source_id"])

            # The old schema links an image to its sequence through
            # "source_sequence"; the new schema names that link "fromSequence".
            # An image with no linked sequence is copied without a reference
            # and counted, instead of being hidden behind a bare `except`.
            source_refs = (q.references or {}).get("source_sequence")
            linked = source_refs.objects if source_refs is not None else []
            if linked:
                references = {"fromSequence": linked[0].uuid}
            else:
                references = None
                images_without_sequence += 1

            batch.add_object(
                properties=properties,
                uuid=q.uuid,
                vector=q.vector["default"],
                references=references,
            )
            images_read += 1

    # Comparing this count with the old instance's total shows whether objects
    # were lost while reading (iterator) or while writing (batch failures).
    print(
        f"Images: read {images_read} objects from the old instance, "
        f"{images_without_sequence} without a sequence reference"
    )
    _report_batch_failures(images_collection_new, "Images")