from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
def embedder(chunks):
    # Preprocess the documents and create TaggedDocuments
    # (word_tokenize needs the NLTK "punkt" data: nltk.download("punkt"))
    tagged_data = [TaggedDocument(words=word_tokenize(chunk.lower()), tags=[str(i)])
                   for i, chunk in enumerate(chunks)]

    # Train the Doc2Vec model
    model = Doc2Vec(vector_size=100, min_count=1)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

    # Verify the vector size
    print("Vector size:", model.vector_size)

    embeddings = []
    for chunk in chunks:
        # Infer a vector for the entire chunk
        chunk_vector = model.infer_vector(word_tokenize(chunk.lower()))
        embeddings.append(chunk_vector)
    return embeddings
This is the embedding function that feeds into my chunking code. I saved each chunk together with its vector in a JSON file, and now I want to store that data in a Weaviate database.
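For reference, this is roughly how I write the chunks and their vectors to data.json (a sketch; the example chunks are placeholders, and the "Answer"/"Vector" keys are the ones the import loop below reads; .tolist() is needed because infer_vector returns NumPy arrays, which json.dump cannot serialize):

import json

chunks = ["first text chunk", "second text chunk"]  # placeholder input
embeddings = embedder(chunks)

# infer_vector returns NumPy float32 arrays; convert them to plain
# Python lists so json.dump can serialize them.
data = [{"Answer": chunk, "Vector": vector.tolist()}
        for chunk, vector in zip(chunks, embeddings)]

with open("data.json", "w") as f:
    json.dump(data, f)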
import weaviate
import json
import pprint

with open("data.json") as f:
    data = json.load(f)

client = weaviate.Client(
    url="http://10.0.129.19:8888",  # Replace with your endpoint
)

# Configure a batch process
client.batch.configure(batch_size=100)

with client.batch as batch:
    # Batch import all answers
    for i, d in enumerate(data):
        print(f"importing answers: {i+1}")
        # pprint.pprint(d["Answer"])
        properties = {
            "answer": d["Answer"],
        }
        batch.add_data_object(properties, "QWERTY", vector=d["Vector"])
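For completeness, the QWERTY class has to exist before the import runs (otherwise Weaviate auto-creates it with default settings). A minimal sketch of creating it, assuming weaviate-client v3 and that vectors are always supplied manually, so the built-in vectorizer is disabled:

# Create the QWERTY class before importing (sketch, weaviate-client v3).
# vectorizer "none" means Weaviate stores the vectors we pass in ourselves.
class_obj = {
    "class": "QWERTY",
    "vectorizer": "none",
    "properties": [
        {"name": "answer", "dataType": ["text"]},
    ],
}
client.schema.create_class(class_obj)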
This import code worked with a different embedding, so is the problem in my embedding code?