Use nearVector?

Manuel_ahedo · January 23, 2024, 10:06am

Hello
I am new to weaviate, I try to use below code embedding some documents to a class using DEFAULT_VECTORIZER_MODULE: ‘text2vec-contextionary’
192.168.1.86:8080 is my weaviate server.
My configure file for the server

    - --scheme
    - http
    image: semitechnologies/weaviate:1.23.2
    ports:
    - 8080:8080
    - 50051:50051
    volumes:
    - ./weaviate_data:/var/lib/weaviate
    restart: on-failure:0
    environment:
      CONTEXTIONARY_URL: contextionary:9999
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-contextionary'
      ENABLE_MODULES: 'text2vec-contextionary'
      CLUSTER_HOSTNAME: 'node1'
  contextionary:
    environment:
      OCCURRENCE_WEIGHT_LINEAR_FACTOR: 0.75
      EXTENSIONS_STORAGE_MODE: weaviate
      EXTENSIONS_STORAGE_ORIGIN: http://weaviate:8080
      NEIGHBOR_OCCURRENCE_IGNORE_PERCENTILE: 5
      ENABLE_COMPOUND_SPLITTING: 'false'
    image: semitechnologies/contextionary:en0.14.0-v1.2.1
    ports:
    - 9999:9999

Embedding code

import os
import weaviate
import docx

# Details about the Weaviate instance
weaviate_url = "http://192.168.1.86:8080"

# Define the class name
class_name = "Document"

try:
    # Connect to the Weaviate instance
    weaviate_client = weaviate.Client(url=weaviate_url)

    # Check if the class already exists
    existing_classes = weaviate_client.schema.get()['classes']
    if not any(cls['class'] == class_name for cls in existing_classes):
        # Define the schema for the class if it doesn't exist
        schema = {
            "classes": [{
                "class": class_name,
                "properties": [{
                    "name": "text",
                    "dataType": ["text"],
                    "vectorizer": "text2vec-contextionary"
                }, {
                    "name": "filename",
                    "dataType": ["string"]
                }, {
                    "name": "section",
                    "dataType": ["int"]
                }]
            }]
        }
        weaviate_client.schema.create(schema)

except Exception as e:
    print(f"Unable to connect to Weaviate. Please check your URL and network connection. Error: {e}")
    exit()

def split_into_sections(text, max_length=512):
    """
    Split text into sections with a maximum length of max_length.
    """
    words = text.split()
    sections = []
    current_section = []

    for word in words:
        current_section.append(word)
        if len(' '.join(current_section)) > max_length:
            sections.append(' '.join(current_section))
            current_section = []
    
    if current_section:
        sections.append(' '.join(current_section))
    
    return sections

def extract_text_from_file(file_path):
    """
    Extract text content from different types of files (txt, docx, pdf).
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".txt":
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    elif file_extension == ".docx":
        doc = docx.Document(file_path)
        full_text = [para.text for para in doc.paragraphs]
        return '\n'.join(full_text)
    elif file_extension == ".pdf":
        # Perform text extraction logic from PDF files here
        return "Content from pdf file"
    else:
        print(f"Unsupported file type: {file_extension}")
        return None

def embed_files_in_weaviate(file_paths):
    """
    Embed text content from files into Weaviate.
    """
    for file_path in file_paths:
        try:
            file_content = extract_text_from_file(file_path)
            if file_content is None:
                continue

            sections = split_into_sections(file_content)
            for i, section in enumerate(sections):
                data_object = {
                    "text": section,
                    "filename": os.path.basename(file_path),
                    "section": i
                }

                weaviate_client.data_object.create(data_object, class_name=class_name)

            print(f"File {file_path} successfully embedded.")

        except Exception as e:
            print(f"Error embedding file {file_path}: {e}")

# Set the path to the data folder
data_folder_path = "data"  # Update with your data folder path

# Get a list of file paths in the data folder
file_paths_to_embed = [os.path.join(data_folder_path, file) for file in os.listdir(data_folder_path) if os.path.isfile(os.path.join(data_folder_path, file))]

# Embed the files into Weaviate
embed_files_in_weaviate(file_paths_to_embed)

then I try to use nearVector but not success with code

import requests
import weaviate

# Weaviate instance details
weaviate_url = "http://192.168.1.86:8080"
client = weaviate.Client(url=weaviate_url)

# Define the class name
class_name = "Document"

def get_vector_for_text(text):
    vectorization_url = f"{weaviate_url}/v1/vectorize"
    headers = {'Content-Type': 'application/json'}
    data = {"text": text}
    
    try:
        response = requests.post(vectorization_url, headers=headers, json=data)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response.json().get('vector')
    except requests.exceptions.RequestException as e:
        print("Error during vectorization:", e)
        return None

def query_weaviate_vector(filter_text):
    vector = get_vector_for_text(filter_text)
    
    if vector is None:
        return "Unable to get vector for text."

    graphql_query = f"""
    {{
      Get {{
        Document(
          nearVector: {{
            vector: {vector}
          }}
        ) {{
          text
          filename
          section
        }}
      }}
    }}
    """
    
    try:
        results = client.query.raw(graphql_query)
        hints = ""
        for result in results.get("data", {}).get("Get", {}).get("Document", []):
            hints += f"Filename: {result.get('filename')}, Section: {result.get('section')}\n{result.get('text')}\n\n"
        return hints
    except Exception as e:
        print("Error during query:", e)
        return None

# Example usage of the query function
filter_text = "Teacher Uncle Ho"
hints = query_weaviate_vector(filter_text)
print("Hints:")
print(hints)

and get error

Error during vectorization: 404 Client Error: Not Found for url: http://192.168.1.86:8080/v1/vectorize
Hints:
Unable to get vector for text.

please help me to solve this problem.
Thanks

DudaNogueira · January 30, 2024, 1:35pm

Hi Manuel!! Welcome to our community

Sorry for the delay here

I have never played too much with this module. What I have found that the path to vectorization using the text2vec-contextionary is:

GET http://localhost:8080/v1/modules/text2vec-contextionary/concepts/**something**

Also, notice, it will only vectorize words, on this case something.

But notice that you don’t need to vectorize the object yourself. If your class has text2vec-contextionary as it’s vectorizer, Weaviate will vectorize it for your, and also allow you to do neatText queries.

Also note that contextionary is a fairly old (but fast) tech. You can probably be better served with, for example:

Let me know if this helps.

Thanks!

Topic		Replies	Views
Near_text with own embeddings Support	1	224	February 12, 2024
How can I get a vector for `nearVector` if I have a vectorizer? General	1	283	October 10, 2023
Near_text() with my own vectorizor Support python	5	385	June 21, 2024
Is nearText() completely a Weaviate cloud operation, no outbound LLM call? General wcs	12	527	April 22, 2024
Near_vector() input validator bug? Support bug	2	155	January 24, 2025

Use nearVector?

Related topics