Hello
I am new to weaviate, I try to use below code embedding some documents to a class using DEFAULT_VECTORIZER_MODULE: ‘text2vec-contextionary’
192.168.1.86:8080 is my weaviate server.
My configure file for the server
- --scheme
- http
image: semitechnologies/weaviate:1.23.2
ports:
- 8080:8080
- 50051:50051
volumes:
- ./weaviate_data:/var/lib/weaviate
restart: on-failure:0
environment:
CONTEXTIONARY_URL: contextionary:9999
QUERY_DEFAULTS_LIMIT: 25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
DEFAULT_VECTORIZER_MODULE: 'text2vec-contextionary'
ENABLE_MODULES: 'text2vec-contextionary'
CLUSTER_HOSTNAME: 'node1'
contextionary:
environment:
OCCURRENCE_WEIGHT_LINEAR_FACTOR: 0.75
EXTENSIONS_STORAGE_MODE: weaviate
EXTENSIONS_STORAGE_ORIGIN: http://weaviate:8080
NEIGHBOR_OCCURRENCE_IGNORE_PERCENTILE: 5
ENABLE_COMPOUND_SPLITTING: 'false'
image: semitechnologies/contextionary:en0.14.0-v1.2.1
ports:
- 9999:9999
Embedding code
import os
import weaviate
import docx
# Details about the Weaviate instance
weaviate_url = "http://192.168.1.86:8080"
# Define the class name
class_name = "Document"
try:
# Connect to the Weaviate instance
weaviate_client = weaviate.Client(url=weaviate_url)
# Check if the class already exists
existing_classes = weaviate_client.schema.get()['classes']
if not any(cls['class'] == class_name for cls in existing_classes):
# Define the schema for the class if it doesn't exist
schema = {
"classes": [{
"class": class_name,
"properties": [{
"name": "text",
"dataType": ["text"],
"vectorizer": "text2vec-contextionary"
}, {
"name": "filename",
"dataType": ["string"]
}, {
"name": "section",
"dataType": ["int"]
}]
}]
}
weaviate_client.schema.create(schema)
except Exception as e:
print(f"Unable to connect to Weaviate. Please check your URL and network connection. Error: {e}")
exit()
def split_into_sections(text, max_length=512):
"""
Split text into sections with a maximum length of max_length.
"""
words = text.split()
sections = []
current_section = []
for word in words:
current_section.append(word)
if len(' '.join(current_section)) > max_length:
sections.append(' '.join(current_section))
current_section = []
if current_section:
sections.append(' '.join(current_section))
return sections
def extract_text_from_file(file_path):
"""
Extract text content from different types of files (txt, docx, pdf).
"""
file_extension = os.path.splitext(file_path)[1].lower()
if file_extension == ".txt":
with open(file_path, "r", encoding="utf-8") as file:
return file.read()
elif file_extension == ".docx":
doc = docx.Document(file_path)
full_text = [para.text for para in doc.paragraphs]
return '\n'.join(full_text)
elif file_extension == ".pdf":
# Perform text extraction logic from PDF files here
return "Content from pdf file"
else:
print(f"Unsupported file type: {file_extension}")
return None
def embed_files_in_weaviate(file_paths):
"""
Embed text content from files into Weaviate.
"""
for file_path in file_paths:
try:
file_content = extract_text_from_file(file_path)
if file_content is None:
continue
sections = split_into_sections(file_content)
for i, section in enumerate(sections):
data_object = {
"text": section,
"filename": os.path.basename(file_path),
"section": i
}
weaviate_client.data_object.create(data_object, class_name=class_name)
print(f"File {file_path} successfully embedded.")
except Exception as e:
print(f"Error embedding file {file_path}: {e}")
# Set the path to the data folder
data_folder_path = "data" # Update with your data folder path
# Get a list of file paths in the data folder
file_paths_to_embed = [os.path.join(data_folder_path, file) for file in os.listdir(data_folder_path) if os.path.isfile(os.path.join(data_folder_path, file))]
# Embed the files into Weaviate
embed_files_in_weaviate(file_paths_to_embed)
then I try to use nearVector but not success with code
import requests
import weaviate
# Weaviate instance details
weaviate_url = "http://192.168.1.86:8080"
client = weaviate.Client(url=weaviate_url)
# Define the class name
class_name = "Document"
def get_vector_for_text(text):
vectorization_url = f"{weaviate_url}/v1/vectorize"
headers = {'Content-Type': 'application/json'}
data = {"text": text}
try:
response = requests.post(vectorization_url, headers=headers, json=data)
response.raise_for_status() # Raise an HTTPError for bad responses
return response.json().get('vector')
except requests.exceptions.RequestException as e:
print("Error during vectorization:", e)
return None
def query_weaviate_vector(filter_text):
vector = get_vector_for_text(filter_text)
if vector is None:
return "Unable to get vector for text."
graphql_query = f"""
{{
Get {{
Document(
nearVector: {{
vector: {vector}
}}
) {{
text
filename
section
}}
}}
}}
"""
try:
results = client.query.raw(graphql_query)
hints = ""
for result in results.get("data", {}).get("Get", {}).get("Document", []):
hints += f"Filename: {result.get('filename')}, Section: {result.get('section')}\n{result.get('text')}\n\n"
return hints
except Exception as e:
print("Error during query:", e)
return None
# Example usage of the query function
filter_text = "Teacher Uncle Ho"
hints = query_weaviate_vector(filter_text)
print("Hints:")
print(hints)
and get error
Error during vectorization: 404 Client Error: Not Found for url: http://192.168.1.86:8080/v1/vectorize
Hints:
Unable to get vector for text.
please help me to solve this problem.
Thanks