import weaviate
from weaviate import Config
import weaviate.classes as wvc
import weaviate.exceptions
import os
import pandas as pd
import numpy as np
import json
import datetime
from datetime import datetime
# Starting up the Weaviate client
# Authenticate against the Weaviate instance with an API key and pass the
# HuggingFace key along so the text2vec-huggingface module can call the
# HuggingFace inference API on our behalf.
auth_config = weaviate.auth.AuthApiKey(api_key="API")
client = weaviate.Client(
    url="https://someweaviate.network",
    auth_client_secret=auth_config,
    additional_headers={
        "X-HuggingFace-Api-Key": "API_hf",
    },
)

# Fail fast: the rest of the script deletes and recreates a class, so there is
# no point continuing if the instance is not ready to serve requests.
if not client.is_ready():
    raise RuntimeError("Weaviate instance is not ready")
# Deleting any previously existing "MachineFailures" class
# Drop the old class (and every object stored in it) so the schema below can
# be created from scratch.
print("delete previous")
client.schema.delete_class("MachineFailures")
# Creating a new class with the defined schema.
# With "vectorizer": "text2vec-transformers", the default transformer model used was bert-base-uncased.
# If you need to use a specific transformer model, such as paraphrase-multilingual-mpnet-base-v2,
# and Weaviate's default model does not meet your requirements, you may need to vectorize your text data outside of Weaviate using the Sentence Transformers library,
# as I described in the previous answer, and then store the resulting vectors in Weaviate manually.
# The textual properties are created with the 'text' data type so that they support both semantic and keyword search (hybrid search).
# Create the "MachineFailures" class. Objects are vectorized through the
# text2vec-huggingface module with a multilingual sentence-transformers model
# and indexed with HNSW using cosine distance.
client.schema.create_class(
    {
        "class": "MachineFailures",
        "description": "A class to store machine failure records",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "vectorIndexType": "hnsw",
        "vectorizer": "text2vec-huggingface",
        "moduleConfig": {
            "text2vec-huggingface": {
                "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
            },
        },
        # One property per CSV column that gets imported further down.
        "properties": [
            {
                "name": "description",
                "dataType": ["text"],
            },
            {
                "name": "wo_number",
                "dataType": ["number"],
            },
            {
                "name": "heading",
                "dataType": ["text"],
            },
            {
                "name": "fail_source",
                "dataType": ["text"],
            },
            {
                "name": "fail_cause",
                "dataType": ["text"],
            },
            {
                "name": "wo_closed_on",
                "dataType": ["date"],
            },
            {
                "name": "repairman",
                "dataType": ["text"],
            },
            {
                "name": "working_hours",
                "dataType": ["number"],
            },
            {
                "name": "machine_no",
                "dataType": ["text"],
            },
            {
                "name": "workdone_comments",
                "dataType": ["text"],
            },
        ],
    }
)
# Checking whether the collection was created successfully
# Confirm the class now exists. The v3-style `weaviate.Client` used in this
# script exposes schema operations under `client.schema`; it has no
# `collection` attribute (that naming belongs to the v4 client). The class
# name must also match the one created above exactly.
print("create new")
print(client.schema.exists("MachineFailures"))
# Importing the data using pandas
# Load the failure records; the first CSV column is used as the row index.
data = pd.read_csv("./data/MachineFailures.csv", index_col=0)
# Getting the "MachineFailures" collection that was created earlier
# `client.collection.get(...)` is not available on the v3-style
# `weaviate.Client`; with this client the class definition is retrieved
# through the schema API instead.
# NOTE(review): this yields the class schema, not a collection handle as in
# the v4 client -- confirm nothing downstream expects the latter.
Machine_Failures_data = client.schema.get("MachineFailures")
# Function to format dates in the ISO 8601 format expected by Weaviate
def format_date_weaviate(date):
    """Format a date value as an ISO 8601 / RFC 3339 UTC timestamp string.

    Args:
        date: Anything ``pandas.to_datetime`` can parse as a single value
            (string, ``datetime``, ``Timestamp``, ...).

    Returns:
        A string such as ``"2023-01-15T00:00:00+00:00"``, or ``None`` when the
        value is missing or cannot be parsed as a date.
    """
    # errors="coerce" turns unparseable input into NaT instead of raising.
    date_obj = pd.to_datetime(date, errors="coerce")

    # Covers NaT as well as None/NaN passed straight through.
    if pd.isna(date_obj):
        return None

    # The output always carries a "+00:00" suffix, so a timezone-aware value
    # must be shifted to UTC first; otherwise its local wall-clock time would
    # be mislabelled as UTC. Naive values are taken to be UTC already.
    if date_obj.tzinfo is not None:
        date_obj = date_obj.tz_convert("UTC")

    return date_obj.strftime("%Y-%m-%dT%H:%M:%S+00:00")
# Iterating through the dataset and storing all records in a list to be inserted later
# Build one property dict per CSV row. Keys match the property names declared
# in the "MachineFailures" schema; values are passed through unchanged except
# for the closing date, which is reformatted for Weaviate's `date` type.
objects_to_add = [
    {
        "description": row["description"],
        "wo_number": row["wo_number"],
        "heading": row["heading"],
        "fail_source": row["fail_source"],
        "fail_cause": row["fail_cause"],
        "wo_closed_on": format_date_weaviate(row["wo_closed_on"]),
        "repairman": row["repairman"],
        "working_hours": row["working_hours"],
        "machine_no": row["machine_no"],
        "workdone_comments": row["workdone_comments"],
    }
    for _, row in data.iterrows()
]
# Define a function to replace non-compliant float values
def replace_non_compliant_values(value):
    """Return ``None`` for NaN/infinite floats, otherwise the value unchanged.

    NaN and +/-infinity cannot be represented in JSON, so they are mapped to
    ``None`` before an object is sent to the server.

    Args:
        value: Any property value.

    Returns:
        ``None`` if ``value`` is a Python or NumPy float that is NaN or
        infinite; otherwise ``value`` itself.
    """
    # np.floating also covers NumPy float types (e.g. float32) that are not
    # subclasses of the built-in float.
    if isinstance(value, (float, np.floating)) and not np.isfinite(value):
        return None
    return value
# Inserting the data into the class
# Insert the records one at a time into the "MachineFailures" class.
for obj in objects_to_add:
    # NaN/inf floats are swapped for None first, since they are not valid JSON.
    sanitized_obj = {
        key: replace_non_compliant_values(value) for key, value in obj.items()
    }
    client.data_object.create(sanitized_obj, "MachineFailures")
# Fetching any 5 objects from the class and printing the response
# Raw GraphQL query: fetch up to five "MachineFailures" objects with a subset
# of their properties, as a quick check that the import worked.
query_string = """
{
Get {
MachineFailures(limit: 5) {
description
wo_number
machine_no
heading
repairman
}
}
}
"""
response = client.query.raw(query_string)
print("Output")
print(response)
# However, in my Docker setup, using the text2vec-transformers module, I was able to import this data successfully.
# What could be the problem? Thanks in advance.
# // Ricky