Error: text too long for vectorization

Description

We are moving from v3 to v4. Now, when I use batch upload or multi-insert, I get an error message saying the text is too long for vectorization.

Can anyone help me regarding this?

Below is my code.

def vectorize_tag_page_data(texts, class_name, layer_name):
    """Create the target collection if needed and batch-insert ``texts`` into it.

    Each entry of ``texts`` becomes one object whose ``text`` property is the
    entry's value and whose ``metadata`` property is the entry's key.

    Args:
        texts: Mapping of metadata key -> text to store.
        class_name: Base collection name; the suffix ``01pagedata`` is appended.
        layer_name: Accepted for caller compatibility; not used by this function.

    Raises:
        EnvironmentError: If ``URL``, ``AUTH_KEY`` or ``OPENAI_API_KEY`` is
            unset or empty.
    """
    # Append layer suffix to class name
    class_name = f"{class_name}01pagedata"

    # Load configuration from environment variables
    weaviate_url = os.getenv("URL")
    weaviate_auth_key = os.getenv("AUTH_KEY")
    openai_key = os.getenv("OPENAI_API_KEY")

    if not weaviate_url or not weaviate_auth_key or not openai_key:
        raise EnvironmentError("One or more required environment variables are missing")

    # Prepare data objects for insertion
    data_objs = [{"text": text, "metadata": key} for key, text in texts.items()]
    total = len(data_objs)

    print(f"\n{total} data objects prepared for insertion.\n")
    print(f"Layer URL: {weaviate_url}")

    # One client serves both schema creation and insertion; it is closed
    # exactly once in the outer ``finally``.
    client = initialize_weaviate_client(weaviate_url, weaviate_auth_key, openai_key)
    try:
        # Create the collection only when it is missing, so re-running the
        # function against an existing collection does not raise.
        try:
            if not client.collections.exists(class_name):
                response = client.collections.create(
                    name=class_name,
                    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
                    properties=[
                        wvc.config.Property(name="text", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="metadata", data_type=wvc.config.DataType.TEXT),
                    ],
                )
                print(response.config.get(simple=False))
        except Exception as e:
            # Best effort: report and still attempt insertion below.
            print(f"Error while creating collection: {e}")

        # Insert data using batching
        try:
            collection = client.collections.get(class_name)
            with collection.batch.dynamic() as batch:
                print("Batch insertion started.")
                for i, data_obj in enumerate(data_objs, 1):
                    # add_object only queues the object; the actual send
                    # happens in the background and on context exit.
                    batch.add_object(properties=data_obj)
                    print(f"Uploaded Tag Data: {i}/{total}")

            # Inspect failures only after the context manager has exited,
            # i.e. after the remaining queued objects were flushed.
            failed_objects = collection.batch.failed_objects
            if failed_objects:
                print(f"Number of errors during batch insertion: {len(failed_objects)}")
                for failed in failed_objects:
                    print(f"Failed object: {failed.message}")
            else:
                print("Batch insertion completed successfully.")

            # Optional: Verify insertion by querying the collection
            try:
                result = collection.query.bm25(query="genAI", limit=10)
                print("\nQuery Results:", result)
            except Exception as e:
                print(f"Error while querying: {e}")
        except Exception as e:
            print(f"An exception occurred: {e}")
    finally:
        client.close()

def initialize_weaviate_client(url, auth_key, openai_key):
    """Build and return a Weaviate client for ``url``.

    Args:
        url: Address of the Weaviate instance.
        auth_key: API key wrapped in ``wvc.AuthApiKey`` for authentication.
        openai_key: Value sent as the ``X-OpenAI-Api-Key`` request header.

    Returns:
        The object produced by ``wvc.Client``.
    """
    # NOTE(review): the caller uses ``client.collections`` (v4-style API) while
    # this constructor takes ``auth_client_secret`` -- confirm that ``wvc.Client``
    # really yields a client exposing ``collections``.
    credentials = wvc.AuthApiKey(api_key=auth_key)
    headers = {"X-OpenAI-Api-Key": openai_key}
    return wvc.Client(
        url=url,
        auth_client_secret=credentials,
        additional_headers=headers,
    )

Server Setup Information

  • Weaviate Server Version: 4.8.1
  • Deployment Method: I am using it directly from Python
  • Multi Node? Number of Running Nodes: 1
  • Client Language and Version: Python 3.12.3
  • Multitenancy?:

Any additional Information

[ErrorObject(message="WeaviateInsertManyAllFailedError('Every object failed during insertion. Here is the set of all errors: text too long for vectorization')", object_=_BatchObject(collection='Sswhhsdflesdfesssssr01pagedata', vector=None, uuid='fb5fc0a6-f652-4e64-bc36-d1bcc0536e0b', properties={'text': 'a', 'metadata': 'name1'}, tenant=None, references=None, index=0, retry_count=0), original_uuid=None), ErrorObject(message="WeaviateInsertManyAllFailedError('Every object failed during insertion. Here is the set of all errors: text too long for vectorization')", object_=_BatchObject(collection='Sswhhsdflesdfesssssr01pagedata', vector=None, uuid='e21581f3-c7dd-447b-b240-bf566314eea7', properties={'text': '1', 'metadata': 'value1'}, tenant=None, references=None, index=1, retry_count=0), original_uuid=None)]

Data I am trying to insert = [{'text': 'a', 'metadata': 'name1'}, {'text': '1', 'metadata': 'value1'}]

hi @Sumat_Mallick !!

Welcome to our community :hugs:

Can you paste the entire stack trace?

Also, can you share here the code snippet so I can try to reproduce this?

Thanks!

I am having the same issue.
See the following links for reference. @Dirk has also been investigating this.

Thank you @DudaNogueira

After further debugging, I found the solution.

I had been using the free version of Weaviate for the last two months, having extended it for another two weeks.

When I moved to the new Weaviate layer, it started working fine.

New Code:

# Create the collection only when it does not already exist.
# NOTE(review): `bot` is bound only inside this branch; if later code uses
# `bot` when the collection already exists, it must be fetched first
# (e.g. client.collections.get(class_name)) -- confirm against the full script.
if not client.collections.exists(class_name):
        bot = client.collections.create(
            name=class_name,
            # OpenAI vectorizer pinned to model "ada", version "002".
            # vectorize_collection_name=False presumably keeps the collection
            # name out of the vectorized text -- confirm in the module docs.
            vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(  
                model="ada", 
                model_version="002",
                vectorize_collection_name = False
            ),
            # Two TEXT properties; both set vectorize_property_name=True.
            properties=[
                wvc.config.Property(
                    name="text",
                    data_type=wvc.config.DataType.TEXT,  
                    vectorize_property_name=True  
                ),
                wvc.config.Property(
                    name="metadata",
                    data_type=wvc.config.DataType.TEXT,  
                    vectorize_property_name=True  
                )
            ]
        )

Old Code

# Try to create the collection unconditionally; any exception is printed and
# the code falls back to fetching the collection by name.
try:
    bot = client.collections.create(
    name=class_name,
    # OpenAI vectorizer pinned to model "ada", version "002".
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai( 
        model="ada", 
        model_version="002",
        vectorize_collection_name = False
    ),
    # "text" sets vectorize_property_name=True, "metadata" sets it to False.
    properties=[
        wvc.config.Property(
            name="text",
            data_type=wvc.config.DataType.TEXT, 
            vectorize_property_name=True  
        ),
        wvc.config.Property(
            name="metadata",
            data_type=wvc.config.DataType.TEXT,  
            vectorize_property_name=False  
        ),
    ]
)

except Exception as e:
    # Every failure lands here, not only "already exists"; the get() below
    # will itself raise if the collection is genuinely unavailable.
    print("Error:", e)
    print("--------*--------**")
    bot = client.collections.get(class_name)