[Question] client.batch.failed_objects or collection.batch.failed_objects for the failed objects.

haozhuoyuan · July 29, 2024, 3:39pm

import time
import weaviate
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import StorageContext, Settings
from llama_index.readers.file import PyMuPDFReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv, find_dotenv
from weaviate.classes.query import MetadataQuery
# Load environment variables
_ = load_dotenv(find_dotenv())
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter notebooks

# 连接到local,需要启动docker
# weaviate_client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051, skip_init_checks=True)
# weaviate_client = weaviate.Client("http://localhost:8080")
weaviate_client = weaviate.connect_to_local()
# Set global LLM and embedding models
Settings.llm = OpenAI(temperature=0, model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=512)
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=100)
# Load PDF documents
# documents = SimpleDirectoryReader("./data", file_extractor={".pdf": PyMuPDFReader()}).load_data()
documents = SimpleDirectoryReader("./data1").load_data()

# Split nodes
nodes = splitter.get_nodes_from_documents(documents)
print(nodes)
# schema = {
#     "classes": [
#         {
#             "class": "TextNode",
#             "properties": [
#                 {"name": "id_", "dataType": ["string"]},
#                 {"name": "embedding", "dataType": ["number[]"]},
#                 {"name": "file_path", "dataType": ["string"]},
#                 {"name": "file_name", "dataType": ["string"]},
#                 {"name": "file_type", "dataType": ["string"]},
#                 {"name": "file_size", "dataType": ["int"]},
#                 {"name": "creation_date", "dataType": ["string"]},
#                 {"name": "last_modified_date", "dataType": ["string"]},
#                 {"name": "source", "dataType": ["string"]},
#                 {"name": "text", "dataType": ["text"]},
#                 {"name": "start_char_idx", "dataType": ["int"]},
#                 {"name": "end_char_idx", "dataType": ["int"]},
#                 {"name": "metadata_str", "dataType": ["string"]},
#                 {"name": "content", "dataType": ["text"]},
#             ],
#         },
#     ]
# }
# try:
if weaviate_client.collections.exists("TextNode"):
    weaviate_client.collections.delete("TextNode")
schema = {
           "class": "TextNode",
           "properties": [
               {"name": "id_", "dataType": ["string"], },
               {"name": "embedding", "dataType": ["number[]"], },
               {"name": "file_path", "dataType": ["string"], },
               {"name": "file_name", "dataType": ["string"], },
               {"name": "file_type", "dataType": ["string"], },
               {"name": "file_size", "dataType": ["int"], },
               {"name": "creation_date", "dataType": ["string"], },
               {"name": "last_modified_date", "dataType": ["string"], },
               # {"name": "source", "dataType": ["string"], },
               {"name": "text", "dataType": ["text"], },
               {"name": "start_char_idx", "dataType": ["int"], },
               {"name": "end_char_idx", "dataType": ["int"], }
               # {"name": "metadata_str", "dataType": ["string"], },
               # {"name": "content", "dataType": ["text"], },
           ]
       }
weaviate_client.collections.create_from_dict(schema)
# finally:
#     weaviate_client.close()
# if not weaviate_client.schema.contains(schema):
#     weaviate_client.schema.create(schema)
# if not weaviate_client.collections.exists("TextNode"):
#     weaviate_client.collections.create("TextNode")
# # 删除现有的类（如果存在）
# if weaviate_client.schema.contains("TextNode"):
#     weaviate_client.schema.delete_class("TextNode")


# 将节点数据添加到 Weaviate
try:
    collection = weaviate_client.collections.get("TextNode")
    data_lines = []
    for node in nodes:
        embedding = Settings.embed_model.get_text_embedding(node.text)  # 生成嵌入
        node.embedding = embedding  # 设置嵌入
        properties = {
            "id": node.id_,
            "embedding": node.embedding,
            "file_path": node.metadata.get("file_path"),
            "file_name": node.metadata.get("file_name"),
            "file_type": node.metadata.get("file_type"),
            "file_size": node.metadata.get("file_size"),
            "creation_date": node.metadata.get("creation_date"),
            "last_modified_date": node.metadata.get("last_modified_date"),
            # "source": node.metadata.get("source"),
            "text": node.text,
            "start_char_idx": node.start_char_idx,
            "end_char_idx": node.end_char_idx,
            # "metadata_str": node.metadata_template,
            # "content": node.text,
        }
        data_lines.append(properties)
    print(data_lines)
    with collection.batch.dynamic() as batch:
        for data_line in data_lines:
            batch.add_object(properties=data_line)
    print("node insert completation！！！！！！！！！！！")


    # jeopardy = weaviate_client.collections.get("TextNode")
    # response = collection.query.near_text(
    #     query="docker部署",
    #     limit=2,
    #     return_metadata=MetadataQuery(distance=True)
    # )
    #
    # for o in response.objects:
    #     print(o.properties)
    #     print(o.metadata.distance)
    # 使用 REST API 进行查询
    # query = {
    #     "query": {
    #         "nearText": {
    #             "concepts": ["docker部署"],
    #         }
    #     },
    #     "limit": 2,
    #     "class": "TextNode"
    # }
    #
    # response = weaviate_client.collections.get(query)
    # print(response)
    #
    # for o in response['data']['Get']['TextNode']:
    #     print(o['properties'])
    #     print(o['_additional']['distance'])

    # from weaviate.collections import Collection
    #
    # my_collection = weaviate_client.collections.get("TextNode")
    #
    #
    # def work_with_collection(collection: Collection):
    #     # Do something with the collection, e.g.:
    #     r = collection.query.near_text(query="docker部署")
    #     return r
    # response = work_with_collection(my_collection)
    # for o in response['data']['Get']['TextNode']:
    #     print(o['properties'])
    #     print(o['_additional']['distance'])
    # exit()
    # Create Vector Store
    # vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, index_name="TextNode", text_key="content")
    vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, index_name="TextNode")
    # vector_store.delete_nodes()
    # Specify Vector Store for index
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_vector_store(vector_store)
    # index = VectorStoreIndex.from_vector_store()
    print(index.index_struct)
    print(index.storage_context)

    query_engine = index.as_query_engine()

    while True:
        question = input("User: ")
        if question.strip() == "":
            break
        start_time = time.time()
        response = query_engine.query(question)
        end_time = time.time()
        print(f"Time taken: {end_time - start_time} seconds")
        print(f"AI: {response}")
finally:
    weaviate_client.close()

DudaNogueira · July 29, 2024, 5:47pm

hi @haozhuoyuan !

Welcome to our community!

Not sure what is your question here

If you are looking for error handling by doing batches:

Let me know if this helps.

Topic		Replies	Views
[Question] client.batch.failed_objects Support technical	1	740	July 30, 2024
Text2vec-openai Batch API Support integration , wcs , python	1	222	July 8, 2024
Weaviate Batch Errors during Batch Insertion with v4 client Support bug , developer-experience , wcs , python , documentation	11	1280	May 15, 2024
Batch insert error Support	1	178	November 21, 2024
Embeded weaviate with objects stuck at 20; connect_to_local shows thousands objects Support python	14	286	November 14, 2024

[Question] client.batch.failed_objects or collection.batch.failed_objects for the failed objects.

Related topics