LlamaIndex: use the text2vec_cohere vectorizer defined in a Weaviate Cloud instance

I want to insert data into a Weaviate Cloud instance using text2vec_cohere and let Weaviate create the embeddings.
I'm using LlamaIndex to fetch data from the web and then insert it, but I get an error stating that I have to define an OpenAI API key.
Below is my code.

import weaviate
import os
from typing import List
from llama_index.core.schema import BaseNode, Document
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.storage import StorageContext
from llama_index.core import VectorStoreIndex, Settings
import weaviate.classes.config as wc
from dotenv import load_dotenv
from weaviate.classes.init import Auth, AdditionalConfig, Timeout
from weaviate.exceptions import WeaviateBaseError

# get the data from the web

def AddData() -> tuple[List[BaseNode], List[Document]]:
    """Download the source web page and split it into nodes.

    Every node's content is printed as a quick visual check of the parse.

    Returns:
        A ``(nodes, docs)`` pair: the nodes produced by ``SimpleNodeParser``
        and the documents loaded by ``SimpleWebPageReader``.
    """
    # NOTE(review): the URL was replaced by the page title ("LlamaIndex and
    # Weaviate | Weaviate - Vector Database") when this snippet was pasted;
    # the address below is presumably the intended page -- confirm.
    docs = SimpleWebPageReader(html_to_text=True).load_data(
        ["https://weaviate.io/blog/llamaindex-and-weaviate"]
    )
    print(f"Loaded {len(docs)} documents")
    parser = SimpleNodeParser()
    nodes = parser.get_nodes_from_documents(docs, show_progress=True)
    for n in nodes:
        print(n.get_content())
    return nodes, docs

# Connect to Weaviate Cloud, make sure the "BlogPosts" collection exists, and
# index the downloaded documents into it through LlamaIndex.
client = None
load_dotenv()

try:
    headers = {
        # "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY"),
        "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")
    }

    nodes, documents = AddData()

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=os.getenv("WEAVIATE_URL"),
        auth_credentials=Auth.api_key(os.getenv("WEAVIATE_APIKEY")),
        additional_config=AdditionalConfig(
            timeout=Timeout(init=30, query=60, insert=30),  # Values in seconds
        ),
        headers=headers,
        skip_init_checks=False
    )

    # Necessary for Cohere?
    os.environ["COHERE_API_KEY"] = os.getenv("COHERE_APIKEY")

    if client.is_ready():
        print(f"Weaviate is ready! Successfully connected to {client.get_meta()}")
    else:
        print("Failed to connect to Weaviate Cloud")
        # Non-zero status so callers can tell the run failed; SystemExit is
        # not an Exception subclass, so only the finally clause below runs.
        raise SystemExit(1)

    # get existing collections
    collections = client.collections.list_all()
    if len(collections) > 0:
        print(f"Found {len(collections)} collections:")
        for c in collections:
            print(c)
    else:
        print("No collections found")

    # Check if BlogPosts collection exists
    if client.collections.get("BlogPosts").exists():
        print("Collection  'BlogPosts' already exists")
    else:
        client.collections.create(
            name="BlogPosts",
            description="A collection of blog posts",
            properties=[
                wc.Property(name="content",
                            data_type=wc.DataType.TEXT,
                            description="The content of the blog post"),
            ],
            # Define the vectorizer module
            vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
            # Define the generative module
            generative_config=wc.Configure.Generative.cohere()
        )

    vector_store = WeaviateVectorStore(weaviate_client=client, index_name="BlogPosts", text_key="content")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # we initiate our index
    # NOTE(review): no embedding model is configured at this point, so
    # LlamaIndex presumably falls back to its OpenAI default here -- this
    # looks like the origin of the "Could not load OpenAI embedding model"
    # error reported for this script.
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

except WeaviateBaseError as e:
    print(f"Failed to connect to Weaviate Cloud: {e.message}")
    raise SystemExit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    raise SystemExit(1)

finally:
    # Always release the connection, including on the exit paths above.
    if client is not None:
        client.close()

I get error:

Could not load OpenAI embedding model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

Consider using embed_model='local'.
Visit our documentation for more embedding options: https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings.html#modules

Finally solved using:

# Set the embedding model explicitly; judging by the error above, LlamaIndex
# otherwise tries to load its OpenAI embedding model.
Settings.embed_model = CohereEmbedding(cohere_api_key=os.getenv("COHERE_APIKEY"))

Below is the revised code. Any suggestions to improve it further? Am I doing something wrong?

import weaviate
import asyncio
import os
import weaviate.classes.config as wc
from typing import List
from llama_index.core.schema import BaseNode, Document
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.storage import StorageContext
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.cohere import CohereEmbedding
from dotenv import load_dotenv
from weaviate.classes.init import Auth, AdditionalConfig, Timeout
from weaviate.exceptions import WeaviateBaseError

# get the data from the web

def AddData() -> tuple[List[BaseNode], List[Document]]:
    """Download the source web page and split it into nodes.

    Every node's content is printed as a quick visual check of the parse.

    Returns:
        A ``(nodes, docs)`` pair: the nodes produced by ``SimpleNodeParser``
        and the documents loaded by ``SimpleWebPageReader``.
    """
    # NOTE(review): the URL was replaced by the page title ("LlamaIndex and
    # Weaviate | Weaviate - Vector Database") when this snippet was pasted;
    # the address below is presumably the intended page -- confirm.
    docs = SimpleWebPageReader(html_to_text=True).load_data(
        ["https://weaviate.io/blog/llamaindex-and-weaviate"]
    )
    print(f"Loaded {len(docs)} documents")
    parser = SimpleNodeParser()
    nodes = parser.get_nodes_from_documents(docs, show_progress=True)
    for n in nodes:
        print(n.get_content())
    return nodes, docs

async def main(*, do_insert: bool = False) -> None:
    """Connect to Weaviate Cloud, ensure ``BlogPosts`` exists, and run a query.

    Nothing in this coroutine is awaited; it stays ``async`` only so that
    existing ``asyncio.run(main())`` callers keep working.

    Args:
        do_insert: When true, download the web page via ``AddData`` and index
            it into the ``BlogPosts`` collection before querying. Defaults to
            ``False`` (query only).

    Raises:
        SystemExit: With status 1 when a required environment variable is
            missing, Weaviate is not ready, or any other error occurs.
    """
    client = None
    load_dotenv()

    # Fail fast with a readable message: assigning a missing value to
    # os.environ below would otherwise raise an opaque TypeError.
    required = ("WEAVIATE_URL", "WEAVIATE_APIKEY", "COHERE_APIKEY")
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        print(f"Missing required environment variables: {', '.join(missing)}")
        raise SystemExit(1)
    cohere_api_key = os.environ["COHERE_APIKEY"]

    try:
        headers = {"X-Cohere-Api-Key": cohere_api_key}

        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=os.environ["WEAVIATE_URL"],
            auth_credentials=Auth.api_key(os.environ["WEAVIATE_APIKEY"]),
            additional_config=AdditionalConfig(
                timeout=Timeout(init=30, query=60, insert=30),  # Values in seconds
            ),
            headers=headers,
            skip_init_checks=False,
        )

        # NOTE(review): probably redundant now that the key is passed to
        # CohereEmbedding explicitly below -- confirm before removing.
        os.environ["COHERE_API_KEY"] = cohere_api_key

        if not client.is_ready():
            print("Failed to connect to Weaviate Cloud")
            # SystemExit is not an Exception subclass, so it bypasses the
            # handlers below while the finally clause still closes the client.
            raise SystemExit(1)
        print(f"Weaviate is ready! Successfully connected to {client.get_meta()}")

        # get existing collections
        collections = client.collections.list_all()
        if collections:
            print(f"Found {len(collections)} collections:")
            for name in collections:
                print(name)
        else:
            print("No collections found")

        # Check if BlogPosts collection exists
        if client.collections.exists("BlogPosts"):
            print("Collection  'BlogPosts' already exists")
        else:
            client.collections.create(
                name="BlogPosts",
                description="A collection of blog posts",
                properties=[
                    wc.Property(
                        name="content",
                        data_type=wc.DataType.TEXT,
                        description="The content of the blog post",
                    ),
                ],
                # Define the vectorizer module
                vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
                # Define the generative module
                generative_config=wc.Configure.Generative.cohere(),
            )

        # NOTE(review): with an embed model set here, LlamaIndex presumably
        # computes the vectors client-side and sends them to Weaviate, so the
        # collection's text2vec_cohere vectorizer may never be used -- verify
        # that both sides use the same Cohere model if server-side
        # vectorization is ever mixed in.
        Settings.embed_model = CohereEmbedding(cohere_api_key=cohere_api_key)
        vector_store = WeaviateVectorStore(
            weaviate_client=client, index_name="BlogPosts", text_key="content"
        )

        if do_insert:
            # Only hit the network for the source page when we actually ingest.
            _, documents = AddData()
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # we initiate our index
            VectorStoreIndex.from_documents(
                documents=documents,
                storage_context=storage_context,
                show_progress=True,
            )

        retriever = VectorStoreIndex.from_vector_store(vector_store).as_retriever(
            similarity_top_k=1
        )
        results = retriever.retrieve("What is weaviate?")
        # The collection may be empty (e.g. nothing inserted yet).
        if results:
            print(results[0])
        else:
            print("No results found")

    except WeaviateBaseError as e:
        print(f"Failed to connect to Weaviate Cloud: {e.message}")
        raise SystemExit(1)
    except Exception as e:
        print(f"An error occurred: {e}")
        raise SystemExit(1)

    finally:
        # Always release the connection, including on the exit paths above.
        if client is not None:
            client.close()

# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    asyncio.run(main())

1 Like

hi!

Thanks for sharing!