Iterator not returning a property

I have this very simple Python code:

def main():
    """Feed every object of the covers collection to the GPT extractor.

    The collection name is read from the COP_COPERTINE_COLLNAME environment
    variable. The Weaviate client is always closed, even if processing fails.
    """
    client = init_weaviate_client()
    try:
        coll_name = os.getenv("COP_COPERTINE_COLLNAME")
        covers = client.collections.get(coll_name)
        gpt_extractor = ManifestoGPTExtractor()
        # iterator() is called with its defaults, i.e. without return_properties.
        for item in covers.iterator():
            gpt_extractor.process_object(covers, item)
    finally:
        client.close()

I have a breakpoint on the process_object line, and when inspecting the obj variable I see:


where you can see that we do NOT have an “editionImageStr” property, which should hold a 250-300K Base64 image.

I am positive that ALL objects in the collection DO HAVE that property as you can see from just a snapshot of one of them through Postman on the same collection:

I cannot find any details on why iterator() is apparently “losing” that BLOB property. I found a post by @sebawita about a search function:

The Python client v4, doesn’t return blob properties by default, as these are potentially large values.
However, if you run a query with return_properties , I would expect values for the selected properties to be present.

but I cannot find a way to make this work with iterator(). Might this “bug” be related?

Here is the collection definition:

# Schema for the newspaper-cover collection: class name, description,
# vectorizer setting and the list of stored properties.
COPERTINE_COLL_CONFIG = {
    "class": COP_COPERTINE_COLLNAME,  # collection name constant, defined elsewhere
    "description": "Collection of Il Manifesto newspaper covers",
    "vectorizer": "none",  # NOTE(review): presumably vectors are supplied by the caller — confirm
    "properties": [
        # Edition identifier; not added to the searchable index.
        Property(
            name="editionId", 
            data_type=DataType.TEXT,
            description="Unique identifier for the edition",
            tokenization="field",
            index_searchable=False
        ),
        # Publication date, stored as a DATE property.
        Property(
            name="editionDateIsoStr",
            data_type=DataType.DATE,
            description="Publication date of the edition"
        ),
        # Cover image as a base64 string. The Python client v4 does not return
        # BLOB values by default; request it explicitly via return_properties.
        Property(
            name="editionImageStr",
            data_type=DataType.BLOB,  
            description="Base64 encoded image string"
        ),
        # AI-generated caption for the image.
        Property(
            name="captionAIStr",
            data_type=DataType.TEXT,  
            description="Image caption as recognized by the AI model"
        ),
        # AI-generated longer description of the image.
        Property(
            name="imageAIDeStr",
            data_type=DataType.TEXT,  
            description="Image description as generated by the AI model"
        ),
        # Name of the AI model; not added to the searchable index.
        Property(
            name="modelAIName",
            data_type=DataType.TEXT,  
            description="AI model name",
            tokenization="field",
            index_searchable=False
        ),
    ]
}

I do need to iterate on this collection and process the images attached to each object in that property. Thanks

Server Setup Information

  • Weaviate Server Version: 1.25.4
  • Deployment Method: docker
  • Multi Node? No
  • Client Language and Version: Python 4.10.2
  • Multitenancy?: No

Cia amigo @rjalex !!

Sorry for the late response. I was on vacanze :wink:

Here is an MVE:

import weaviate
from langchain_huggingface import HuggingFaceEmbeddings
import weaviate.classes.config as wc
from weaviate import classes as wvc

import base64

# Use the client as a context manager so the connection is closed even if a
# step below raises (the script previously never closed it).
with weaviate.connect_to_local() as client:
    print(f"Client: {weaviate.__version__}, Server: {client.get_meta().get('version')}")

    # Start from a clean slate so the example can be re-run.
    client.collections.delete("Test")
    collection = client.collections.create(
        name='Test',
        properties=[
            wc.Property(name='text', data_type=wc.DataType.TEXT),
            wc.Property(name='blob', data_type=wc.DataType.BLOB),
        ],
    )

    # The image is stored as a base64-encoded string in the blob property.
    with open("/Users/dudanogueira/Pictures/sample.png", 'rb') as file:
        blob = base64.b64encode(file.read()).decode('utf-8')
    collection.data.insert(
        properties={"text": "example", "blob": blob},
    )

    # check if we have the objects:
    print(collection.aggregate.over_all().total_count)

    # Name the blob property explicitly in return_properties so that it is
    # included in the objects yielded by the iterator.
    for o in collection.iterator(return_properties=["text", "blob"]):
        print(o.properties)

I can see the blob there :grimacing: using:
Client: 4.10.4, Server: 1.28.3

Let me know if this helps.

1 Like

Never apologize for taking time out my friend :slight_smile:

Actually, after some thought, I decided that, architecturally, having images in the database was not the best pattern in my case, so I decided to store them on the filesystem and just keep a link to them in the Weaviate object.

Having said that it is very useful to know that if needed it can be done.

If anyone stumbles on this problem/solution, here is the complete revised code of your example:

pyproject.toml:

# Poetry project metadata for the blob storage example.
[tool.poetry]

name = "weaviateblobtest"

version = "0.1.0"

description = ""
authors = ["Duda & Bob <vacations@are.fun>"]
readme = "README.md"
# Poetry is used for dependency management only; the project is not packaged.
package-mode = false

# Runtime dependencies.
[tool.poetry.dependencies]
python = "^3.11"
weaviate-client = "^4.10.4"
# NOTE(review): the example program imports the stdlib `base64` module, not
# `pybase64`, so this dependency looks unused — confirm before keeping it.
pybase64 = "^1.4.0"

# Development-only dependencies.
[tool.poetry.group.dev.dependencies]
mypy = "^1.8.0"

[build-system]
requires = ["poetry-core"]

build-backend = "poetry.core.masonry.api"

and the full program with error handling, context manager and nice prints :slight_smile:

"""This module demonstrates how images can be stored and retrieved as base64-encoded blobs
in a Weaviate collection. It shows basic operations like creating a collection with blob
support, inserting an image as a blob, and retrieving it back from the collection.
"""
import base64
import weaviate
from weaviate.exceptions import WeaviateConnectionError
import weaviate.classes.config as wc
from weaviate.collections import Collection

def create_test_collection(client: weaviate.WeaviateClient, name: str = "Test") -> Collection:
    """Create a fresh collection with one text and one blob property.

    Any collection already stored under ``name`` is deleted first, so the
    example can be re-run from a clean state.

    Args:
        client: Connected Weaviate client.
        name: Name of the collection to (re)create. Defaults to ``"Test"``.

    Returns:
        The newly created collection.

    Errors raised by the client while checking, deleting or creating the
    collection propagate to the caller.
    """
    # Check for existence explicitly instead of wrapping the delete in a
    # blanket ``except Exception: pass``, which would also hide real failures
    # such as a lost connection.
    if client.collections.exists(name):
        client.collections.delete(name)

    return client.collections.create(
        name=name,
        properties=[
            wc.Property(name='text', data_type=wc.DataType.TEXT),
            wc.Property(name='blob', data_type=wc.DataType.BLOB),
        ],
    )

def main() -> None:
    """Store an image as a base64 blob in Weaviate and read it back.

    Connects to a local Weaviate instance, recreates the test collection,
    inserts ``image.jpg`` from the current directory as a blob, then prints
    the object count and the text and blob length of every stored object.

    Failures are reported on stdout and end the run; nothing is re-raised.
    """
    try:
        # The context manager closes the connection on every exit path.
        with weaviate.connect_to_local() as client:
            print(f"Client: {weaviate.__version__}, Server: {client.get_meta().get('version')}")

            collection = create_test_collection(client)

            # Read and encode the image
            try:
                with open("image.jpg", 'rb') as file:
                    blob = base64.b64encode(file.read()).decode('utf-8')
            except FileNotFoundError:
                print("Error: image.jpg not found in the current directory")
                return
            except OSError as e:
                # Other I/O failures (permissions, path is a directory, ...).
                print(f"Error reading image file: {e}")
                return

            # Insert the blob
            try:
                collection.data.insert(
                    properties={"text": "Text field describing the image", "blob": blob},
                )
            except Exception as e:
                print(f"Error inserting data into collection: {e}")
                return

            # Verify the data
            try:
                count = collection.aggregate.over_all().total_count
                print(f"Total objects in collection: {count}")

                print("\nRetrieved objects:")
                # The blob property is named explicitly in return_properties
                # so that it is included in the returned objects.
                for obj in collection.iterator(return_properties=["text", "blob"]):
                    properties = obj.properties
                    stored_blob = properties.get('blob')
                    print(f"Text: {properties.get('text')}")
                    if stored_blob is None:
                        # len(str(None)) would misreport a missing blob as
                        # having length 4, so report the absence explicitly.
                        print("Blob length: 0 (blob property missing)")
                    else:
                        print(f"Blob length: {len(str(stored_blob))}")
            except Exception as e:
                print(f"Error retrieving data from collection: {e}")
                return

    except WeaviateConnectionError:
        print("Error: Could not connect to Weaviate. Make sure the Weaviate instance is running locally.")
    except Exception as e:
        # Top-level boundary: report anything unexpected instead of a traceback.
        print(f"Unexpected error: {e}")

# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
1 Like