[ERROR] Batch ConnectionError Exception occurred!

Hi Team,

I am getting the error below during my initial testing and data loading. Please suggest whether any environment variables need to change on the DB end or in the Python code.

packages\weaviate\batch\crud_batch.py:1086: RuntimeWarning: The BatchExecutor was shutdown, most probably when it exited the with statement. It will be initialized again. If you are not batch in the with client.batch as batch please make sure to shut it down when done importing data: client.batch.shutdown(). You can start it again using the client.batch.start() method.
warnings.warn(
2%|█▊ | 430/17766 [05:26<5:50:01, 1.21s/it][ERROR] Batch ConnectionError Exception occurred! Retrying in 2s. [1/3]
3%|██▍ | 573/17766 [06:43<2:41:42, 1.77it/s][ERROR] Batch ConnectionError Exception occurred! Retrying in 2s. [1/3]
12%|████████▋ | 2069/17766 [22:46<3:01:53, 1.44it/s][ERROR] Batch ConnectionError Exception occurred! Retrying in 2s. [1/3]
13%|█████████▌ | 2254/17766 [25:20<3:47:45, 1.14it/s]Exception in thread batchSizeRefresh:
Traceback (most recent call last):
  File "C:\Users\x0135069\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 790, in urlopen
    response = self._make_request(
  File "C:\Users\x0135069\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\x0135069\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py", line 454, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\x0135069\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1374, in getresponse
    response.begin()
  File "C:\Users\x0135069\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 318, in begin
    version, status, reason = self._read_status()
  File "C:\Users\x0135069\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 287, in _read_status
    raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:
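
For context, the RuntimeWarning above is the v3 Python client reporting that its BatchExecutor was shut down (typically by leaving a with block) and has to be re-created. A minimal sketch of the with-statement pattern the warning itself recommends; the class name and object are placeholders:

    client.batch.configure(batch_size=50)
    with client.batch as batch:
        # all adds happen inside the with block, so the executor is
        # flushed and shut down exactly once, on exit
        batch.add_data_object({"content": "example text"}, "MyClass")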

Hi!

This looks like a network error.
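
RemoteDisconnected during batching often means the server dropped a long-running request. One thing worth checking (an assumption, not a confirmed fix) is the client's timeout configuration; a sketch with larger, purely illustrative values:

    import weaviate

    # (connect, read) timeouts in seconds for the v3 client; values are illustrative
    client = weaviate.Client(
        "http://<hostname_replaced>:8080",
        timeout_config=(30, 120),
    )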

Can you share how you are importing your data?

Hi Team,

Please find below the conf files we are using.

{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"import json\n",
"from langchain.retrievers import AzureCognitiveSearchRetriever\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.llms import AzureOpenAI\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.vectorstores import DocArrayInMemorySearch\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.text_splitter import (RecursiveCharacterTextSplitter, Language)\n",
"from langchain.document_loaders import TextLoader\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.schema import BaseRetriever\n",
"from langchain.vectorstores.azuresearch import AzureSearch\n",
"from langchain.vectorstores import Weaviate\n",
"import weaviate\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"from tqdm import tqdm\n",
"from openai.error import RateLimitError\n",
"import time\n",
"import glob\n",
"# from docx2python import docx2python\n",
"# import PyPDF2\n",
"\n",
"# from app.openailogging import OpenAILogger\n",
"# logger = OpenAILogger('promptresponselogs', 'prompt-response2.json')\n",
"# os.environ[\"AZURE_COGNITIVE_SEARCH_SERVICE_NAME\"] = \"az-cogsearch-openai-uswst\"\n",
"# os.environ[\"AZURE_COGNITIVE_SEARCH_INDEX_NAME\"] = \"codebase-pdf-test-uswst\"\n",
"os.environ[\"AZURE_COGNITIVE_SEARCH_INDEX_NAME\"] = \"EmbeddinganalysisvectorbatchTest\"\n",
"# os.environ[\"AZURE_COGNITIVE_SEARCH_API_KEY\"] = '<api_key_replaced>'\n",
"os.environ['OPENAI_API_KEY'] = '<api_key_replaced>'\n",
"os.environ['OPENAI_API_TYPE'] = \"azure\"\n",
"\n",
"os.environ['OPENAI_API_BASE'] = \"https://openai-useast-dev.openai.azure.com/\"\n",
"openai.api_base = os.environ['OPENAI_API_BASE']\n",
"openai.api_type = \"azure\"\n",
"openai.api_key = os.environ['OPENAI_API_KEY']\n",
"openai.api_version = \"2023-05-15\"\n",
"model: str = \"text-embedding-ada-002\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"PLEASE ADD DIFFERENT INDEX NAME TO AVOID OVERWRITING IN THE BELOW CELL"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"vector_store_address: str = \"https://az-cogsearch-openai-uswst.search.windows.net\"\n",
"# vector_store_password: str = os.environ[\"AZURE_COGNITIVE_SEARCH_API_KEY\"]\n",
"index_name: str = os.environ[\"AZURE_COGNITIVE_SEARCH_INDEX_NAME\"]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=model, chunk_size=1, max_retries=100)\n",
"client = weaviate.Client(\"http://<hostname_replaced>:8080\", timeout_config=(10, 15))\n",
"client.batch.configure(batch_size=50)  # Configure batch\n",
"vector_store: Weaviate = Weaviate(\n",
"    client=client,\n",
"    index_name=index_name,\n",
"    text_key=\"content\",\n",
"    embedding=embeddings\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# embeddings = OpenAIEmbeddings(deployment=\"text-embedding-ada-002\", openai_api_version=\"2023-03-15-preview\", chunk_size=1)\n",
"args = {\n",
"    \"engine\": \"gpt35turbo\",\n",
"}\n",
"llm = ChatOpenAI(temperature=0.3, model_kwargs=args, verbose=False)\n",
"# char_text_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=2000, chunk_overlap=5)\n",
"char_text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=5)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def read_filenames(path, extensions):\n",
"    files = []\n",
"    for extension in extensions:\n",
"        files.extend(glob.glob(path + '/**/*' + extension, recursive=True))\n",
"    return files"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# def read_files(files):\n",
"#     file_contents = {}\n",
"#     for file in files:\n",
"#         try:\n",
"#             pdfReader = PyPDF2.PdfReader(file)\n",
"#             number_of_pages = len(pdfReader.pages)\n",
"#             file_content = b''\n",
"#             for page_num in range(0, number_of_pages):\n",
"#                 pagehandle = pdfReader.pages[page_num]\n",
"#                 file_content += pagehandle.extract_text().encode('UTF-8')\n",
"#             filename = \"TEXT-\" + os.path.basename(file).replace('.pdf', '.txt')\n",
"#             writing_path = r'C:\\acs\\dataset\\benchmark\\ARK_DUMP\\dump_all'\n",
"#             file_contents[filename] = file_content\n",
"#             with open(writing_path + \"\\\\\" + filename, 'wb') as f:\n",
"#                 f.write(file_content)\n",
"#         except Exception as e:\n",
"#             print(f\"Error processing PDF file {file}: {str(e)}\")\n",
"#     return file_contents\n",
"\n",
"# read the files and store them in a dict keyed by path\n",
"def read_files(files):\n",
"    file_contents = {}\n",
"    for file in files:\n",
"        with open(file, 'rb') as f:\n",
"            writing_path = r'C:\\Software\\weaveit\\arc\\newKbFin_18k_snow\\newKbFin'\n",
"            file_contents[os.path.join(writing_path, os.path.basename(file))] = f.read()\n",
"    return file_contents\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"  File Extension  Count\n",
"0           .txt  17766\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"# Define the directory you want to start your search from\n",
"root_dir = r'C:\\Software\\weaveit\\arc\\newKbFin_18k_snow\\newKbFin'\n",
"\n",
"# Define an empty dictionary to store the file extensions and counts\n",
"file_extensions = {}\n",
"\n",
"# Traverse through all the directories and files in the root_dir\n",
"for subdir, dirs, files in os.walk(root_dir):\n",
"    for file in files:\n",
"        # Get the file extension\n",
"        file_ext = os.path.splitext(file)[-1].lower()\n",
"        # Add the file extension to the dictionary and increment the count\n",
"        if file_ext in file_extensions:\n",
"            file_extensions[file_ext] += 1\n",
"        else:\n",
"            file_extensions[file_ext] = 1\n",
"\n",
"# Convert the dictionary to a pandas dataframe and sort by count in descending order\n",
"df = pd.DataFrame(list(file_extensions.items()), columns=['File Extension', 'Count']).sort_values('Count', ascending=False)\n",
"\n",
"# Print the dataframe\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of files:  17766\n"
]
}
],
"source": [
"path = r'C:\\Software\\weaveit\\arc\\newKbFin_18k_snow\\newKbFin'\n",
"\n",
"extensions = ['.txt']\n",
"files = read_filenames(path=path, extensions=extensions)\n",
"# print(files)\n",
"print(\"Length of files: \", len(files))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"contents = read_files(files)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# contents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"  0%|          | 15/17766 [00:11<2:19:30,  2.12it/s]C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\weaviate\\batch\\crud_batch.py:1086: RuntimeWarning: The BatchExecutor was shutdown, most probably when it exited the with statement. It will be initialized again. If you are not batch in the with client.batch as batch please make sure to shut it down when done importing data: client.batch.shutdown(). You can start it again using the client.batch.start() method.\n",
"  warnings.warn(\n",
"  5%|████      | 949/17766 [05:36<1:30:07,  3.11it/s][ERROR] Batch ConnectionError Exception occurred! Retrying in 2s. [1/3]\n",
" 14%|██████████▎ | 2449/17766 [14:28<5:43:21,  1.34s/it]Exception in thread batchSizeRefresh:\n",
"Traceback (most recent call last):\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py\", line 790, in urlopen\n",
"    response = self._make_request(\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py\", line 536, in _make_request\n",
"    response = conn.getresponse()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\urllib3\\connection.py\", line 454, in getresponse\n",
"    httplib_response = super().getresponse()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\http\\client.py\", line 1374, in getresponse\n",
"    response.begin()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\http\\client.py\", line 318, in begin\n",
"    version, status, reason = self._read_status()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\http\\client.py\", line 287, in _read_status\n",
"    raise RemoteDisconnected(\"Remote end closed connection without\"\n",
"http.client.RemoteDisconnected: Remote end closed connection without response\n",
"\n",
"During handling of the above exception, another exception occurred:\n",

Is that a Jupyter notebook?

Yes, it is Jupyter we are using.

Hi @DudaNogueira,

Can you please let me know if there are any issues with Jupyter, or if anything needs to change in the configuration?

Hi! Sorry for the delay here :slight_smile:

This notebook is not valid :frowning:

How big is the data? Maybe you'll need to throttle it.
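
A sketch of what throttling could look like with the v3 client's batch configuration; the numbers are illustrative, not tuned values:

    client.batch.configure(
        batch_size=20,       # smaller batches put less pressure on the server
        num_workers=1,       # send batches sequentially instead of in parallel
        timeout_retries=5,   # retry batches that time out
        dynamic=True,        # let the client adapt the batch size to server speed
    )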

I can see you are using LangChain. For now it does at least one unnecessary class check for each added text, which adds some overhead to the import.

What you could try is to ingest the data yourself, following this doc here:
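
For illustration, a minimal sketch of direct ingestion with the v3 client, reusing the notebook's client, index_name, embeddings, and contents; the "content" property mirrors the notebook's text_key, and calling embed_query per document is an assumption about how the vectors are meant to be produced:

    with client.batch as batch:  # flushes and shuts down cleanly on exit
        for path, raw in contents.items():
            text = raw.decode("utf-8", errors="ignore")  # read_files returns bytes
            batch.add_data_object(
                data_object={"content": text},
                class_name=index_name,
                vector=embeddings.embed_query(text),  # omit if the class has a vectorizer
            )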

Let me know if this helps :slight_smile: