Description
We are moving from v3 to v4. Now, when I use batch upload or multi-insert, I get an error message saying the text is too long for vectorization.
Can anyone help me with this?
Below is my code.
def vectorize_tag_page_data(texts, class_name, layer_name):
    """Create a Weaviate collection and batch-insert ``texts`` into it.

    The collection is named ``f"{class_name}01pagedata"`` and is configured
    with the ``text2vec_openai`` vectorizer and two TEXT properties, ``text``
    and ``metadata``. Progress and errors are reported with ``print``; nothing
    is returned.

    Args:
        texts: Mapping whose keys are stored in the ``metadata`` property and
            whose values are stored in the ``text`` property.
        class_name: Base collection name; the fixed suffix ``01pagedata`` is
            appended to it.
        layer_name: Currently unused; kept so existing callers keep working.

    Raises:
        EnvironmentError: If ``URL``, ``AUTH_KEY`` or ``OPENAI_API_KEY`` is
            unset or empty.
    """
    # The suffix is a fixed constant; ``layer_name`` does not take part in it.
    class_name = f"{class_name}01pagedata"

    # Load configuration from environment variables.
    weaviate_url = os.getenv("URL")
    weaviate_auth_key = os.getenv("AUTH_KEY")
    openai_key = os.getenv("OPENAI_API_KEY")
    if not (weaviate_url and weaviate_auth_key and openai_key):
        raise EnvironmentError("One or more required environment variables are missing")

    # One object per mapping entry: the value is the text, the key its metadata.
    data_objs = [{"text": text, "metadata": key} for key, text in texts.items()]
    total = len(data_objs)
    print(f"\n{total} data objects prepared for insertion.\n")
    print(f"Layer URL: {weaviate_url}")

    # A single client serves both collection creation and insertion; it is
    # closed exactly once in the ``finally`` below.
    client = initialize_weaviate_client(weaviate_url, weaviate_auth_key, openai_key)
    try:
        try:
            created = client.collections.create(
                name=class_name,
                vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
                properties=[
                    wvc.config.Property(name="text", data_type=wvc.config.DataType.TEXT),
                    wvc.config.Property(name="metadata", data_type=wvc.config.DataType.TEXT),
                ],
            )
            print(created.config.get(simple=False))
        except Exception as e:
            # Best effort: report the failure and still attempt the insert
            # (presumably so a pre-existing collection is not fatal -- confirm).
            print(f"Error while creating collection: {e}")

        try:
            collection = client.collections.get(class_name)
            with collection.batch.dynamic() as batch:
                print("Batch insertion started.")
                for i, data_obj in enumerate(data_objs, 1):
                    # add_object only queues the object; sending happens in
                    # the background and on exit from the ``with`` block.
                    batch.add_object(properties=data_obj)
                    print(f"Uploaded Tag Data: {i}/{total}")

            # The failure list is complete only after the context manager has
            # flushed the remaining queue, so inspect it here, not inside.
            failed_objects = collection.batch.failed_objects
            if failed_objects:
                print(f"Number of errors during batch insertion: {len(failed_objects)}")
                # Show a few messages so the cause is visible without
                # flooding the output on large imports.
                for failed in failed_objects[:5]:
                    print(f"Failed object: {failed.message}")
            else:
                print("Batch insertion completed successfully.")

            # Optional: verify insertion by querying the collection.
            try:
                result = collection.query.bm25(query="genAI", limit=10)
                print("\nQuery Results:", result)
            except Exception as e:
                print(f"Error while querying: {e}")
        except Exception as e:
            print(f"An exception occurred: {e}")
    finally:
        client.close()
def initialize_weaviate_client(url, auth_key, openai_key):
    """Build and return a Weaviate client for ``url``.

    Args:
        url: Address of the Weaviate instance.
        auth_key: API key wrapped in ``wvc.AuthApiKey`` for authentication.
        openai_key: Sent with requests as the ``X-OpenAI-Api-Key`` header.

    Returns:
        The object produced by ``wvc.Client``.
    """
    # NOTE(review): ``Client(url=..., auth_client_secret=..., additional_headers=...)``
    # looks like the v3 constructor signature, while the caller uses the v4
    # ``client.collections`` API -- confirm which client ``wvc.Client`` is.
    credentials = wvc.AuthApiKey(api_key=auth_key)
    extra_headers = {"X-OpenAI-Api-Key": openai_key}
    return wvc.Client(
        url=url,
        auth_client_secret=credentials,
        additional_headers=extra_headers,
    )
Server Setup Information
- Weaviate Server Version: 4.8.1
- Deployment Method: I am using it directly from Python
- Multi Node? Number of Running Nodes: 1
- Client Language and Version: Python 3.12.3
- Multitenancy?:
Any additional Information
[ErrorObject(message="WeaviateInsertManyAllFailedError('Every object failed during insertion. Here is the set of all errors: text too long for vectorization')", object_=_BatchObject(collection='Sswhhsdflesdfesssssr01pagedata', vector=None, uuid='fb5fc0a6-f652-4e64-bc36-d1bcc0536e0b', properties={'text': 'a', 'metadata': 'name1'}, tenant=None, references=None, index=0, retry_count=0), original_uuid=None), ErrorObject(message="WeaviateInsertManyAllFailedError('Every object failed during insertion. Here is the set of all errors: text too long for vectorization')", object_=_BatchObject(collection='Sswhhsdflesdfesssssr01pagedata', vector=None, uuid='e21581f3-c7dd-447b-b240-bf566314eea7', properties={'text': '1', 'metadata': 'value1'}, tenant=None, references=None, index=1, retry_count=0), original_uuid=None)]
Data I am trying to insert: [{'text': 'a', 'metadata': 'name1'}, {'text': '1', 'metadata': 'value1'}]