I have a df that contains a sentencized version of about 1100 full-text dissertations on a topic. The df consists of 2 columns: the id of the actual dissertation from another df that contains all of the metadata; and then the full_text in English (I used mBart to translate the non-English disses into English) . I want to work exclusively with local open source tools that I can run on my PC’s GPU - NOT in the cloud. E.g. for the embeddings I want to use the currently leading local language models on the MTEB leaderboard.
Does somebody have a Jupyter notebook for this? As well as a docker-compose.yml with the exact setup I need to run this locally?
I’ve been trying this code
import pandas as pd
import torch
import uuid
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import weaviate
from weaviate.connect import Connection
# Initialize Model and Tokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
model.to(torch.device("cuda"))
# Function to Compute Embeddings
def compute_embeddings(text):
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(torch.device("cuda"))
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
# Function to create text chunks
def create_text_chunks(dataframe, chunk_size=3, overlap=2):
chunks = []
idx = 0
while idx < len(dataframe):
if idx + chunk_size <= len(dataframe):
chunk = ' '.join(dataframe.iloc[idx:idx + chunk_size]['sentence'])
chunks.append(chunk)
idx += chunk_size - overlap # Move the window
return chunks
# Your generated API key
api_key = "MY_GENERATED_API_KEY"
# Initialize Weaviate Client with API key authentication
client = weaviate.Client(url="http://localhost:8080", auth_client_secret=api_key)
# Delete existing class (if needed)
try:
client.schema.delete_class("RFSDPDissesEmbeddingEn")
print("Class RFSDPDissesEmbeddingEn deleted successfully.")
except Exception as e:
print(f"Error deleting class: {e}")
# Create new class schema
class_schema = {
"class": "RFSDPDissesEmbeddingEn",
"description": "Store embeddings of sentences from dissertations",
"properties": [
{
"name": "text",
"dataType": ["text"],
"description": "The text of the sentence",
},
{
"name": "embedding",
"dataType": ["number"],
"description": "The embedding vector",
"vectorIndexType": "hnsw",
"vectorizer": "none",
},
{
"name": "reference",
"dataType": ["int"],
"description": "A reference to the original paper",
}
]
}
# Create new class
try:
client.schema.create_class(class_schema)
print("Class RFSDPDissesEmbeddingEn created successfully.")
except Exception as e:
print(f"Error creating class: {e}")
# Load DataFrame
# df_papers_sents = pd.read_csv('your_csv_file.csv') # Replace with your actual path to the CSV file
# Create chunks
chunks = create_text_chunks(df_papers_sents, chunk_size=3, overlap=2)
# Process Data and Store in Weaviate
batch_size = 256 # Adjust as needed
for start_idx in tqdm(range(0, len(chunks), batch_size), desc="Processing Batches"):
end_idx = start_idx + batch_size
batch_chunks = chunks[start_idx:end_idx]
for index, chunk in enumerate(batch_chunks):
if chunk.strip(): # Check if the chunk is not empty
embedding = compute_embeddings(chunk)
data_object = {
"embedding": embedding,
"text": chunk,
"reference": start_idx + index # Reference to the original position in df
}
unique_id = str(uuid.uuid4())
try:
client.data_object.create(data_object, "RFSDPDissesEmbeddingEn", unique_id)
print(f"Added chunk embedding with UUID: {unique_id}")
except Exception as e:
print(f"Error adding chunk embedding: {e}")
print("Completed processing and storing embeddings.")
But I get
/root/miniconda3/envs/doc2json/lib/python3.8/site-packages/weaviate/warnings.py:15: UserWarning:
Auth001: The client was configured to use authentication, but weaviate is configured without
authentication. Are you sure this is correct?
Error deleting class: Delete class from schema! Unexpected status code: 403, with response body: {'error': [{'message': "forbidden: user 'anonymous' has insufficient permissions to delete schema/objects"}]}.
Error creating class: Create class! Unexpected status code: 403, with response body: {'error': [{'message': "forbidden: user 'anonymous' has insufficient permissions to create schema/objects"}]}.
Any suggestions?