Hi Team,
Please find below the configuration/notebook file we are using.
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"import json\n",
"from langchain.retrievers import AzureCognitiveSearchRetriever\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.llms import AzureOpenAI\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.vectorstores import DocArrayInMemorySearch\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.text_splitter import (RecursiveCharacterTextSplitter,Language)\n",
"from langchain.document_loaders import TextLoader\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.schema import BaseRetriever\n",
"from langchain.vectorstores.azuresearch import AzureSearch\n",
"from langchain.vectorstores import Weaviate\n",
"import weaviate\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"from tqdm import tqdm\n",
"from openai.error import RateLimitError\n",
"import time\n",
"import glob\n",
"# from docx2python import docx2python\n",
"# import PyPDF2\n",
"\n",
"# from app.openailogging import OpenAILogger\n",
"# logger = OpenAILogger('promptresponselogs', 'prompt-response2.json')\n",
"# os.environ[\"AZURE_COGNITIVE_SEARCH_SERVICE_NAME\"] = \"az-cogsearch-openai-uswst\"\n",
"# os.environ[\"AZURE_COGNITIVE_SEARCH_INDEX_NAME\"] = \"codebase-pdf-test-uswst\"\n",
"os.environ[\"AZURE_COGNITIVE_SEARCH_INDEX_NAME\"] = \"EmbeddinganalysisvectorbatchTest\"\n",
"# os.environ[\"AZURE_COGNITIVE_SEARCH_API_KEY\"] = '<REDACTED>'  # SECURITY: a real key was pasted here; rotate it and load from a secrets manager\n",
"os.environ['OPENAI_API_KEY'] = '<REDACTED>'  # SECURITY: a real key was hardcoded here; rotate it and read it from the environment or a secrets manager\n",
"os.environ['OPENAI_API_TYPE'] = \"azure\"\n",
"\n",
"os.environ['OPENAI_API_BASE'] = \"https://openai-useast-dev.openai.azure.com/\"\n",
"openai.api_base = os.environ['OPENAI_API_BASE']\n",
"openai.api_type = \"azure\"\n",
"openai.api_key = os.environ['OPENAI_API_KEY']\n",
"openai.api_version = \"2023-05-15\"\n",
"model: str = \"text-embedding-ada-002\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"PLEASE ADD A DIFFERENT INDEX NAME TO AVOID OVERWRITING IN THE CELL BELOW"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"vector_store_address: str = \"https://az-cogsearch-openai-uswst.search.windows.net\"\n",
"# vector_store_password: str = os.environ[\"AZURE_COGNITIVE_SEARCH_API_KEY\"]\n",
"index_name: str = os.environ[\"AZURE_COGNITIVE_SEARCH_INDEX_NAME\"]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=model, chunk_size=1, max_retries=100)\n",
"client = weaviate.Client(\"http://<hostname_replaced>:8080\", timeout_config=(10, 15))\n",
"client.batch.configure(batch_size=50)  # Configure batch\n",
"vector_store: Weaviate = Weaviate(\n",
"    client=client,\n",
"    index_name=index_name,\n",
"    text_key=\"content\",\n",
"    embedding=embeddings\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# embeddings = OpenAIEmbeddings(deployment=\"text-embedding-ada-002\", openai_api_version=\"2023-03-15-preview\", chunk_size=1)\n",
"args = {\n",
"    \"engine\": \"gpt35turbo\",\n",
"}\n",
"llm = ChatOpenAI(temperature=0.3, model_kwargs=args, verbose=False)\n",
"# char_text_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=2000, chunk_overlap=5)\n",
"char_text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=5)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def read_filenames(path,extensions):\n",
"    files = []\n",
"    for extension in extensions:\n",
"        files.extend(glob.glob(path + '/**/*' + extension, recursive=True))\n",
"    return files"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# def read_files(files):\n",
"#     file_contents = {}\n",
"#     for file in files:\n",
"#         try:\n",
"#             pdfReader = PyPDF2.PdfReader(file)\n",
"#             number_of_pages = len(pdfReader.pages)\n",
"#             file_content = b''\n",
"#             for page_num in range(0, number_of_pages):\n",
"#                 pagehandle = pdfReader.pages[page_num]\n",
"#                 file_content += pagehandle.extract_text().encode('UTF-8')\n",
"#             filename = \"TEXT-\" + os.path.basename(file).replace('.pdf', '.txt')\n",
"#             writing_path = r'C:\\acs\\dataset\\benchmark\\ARK_DUMP\\dump_all'\n",
"#             file_contents[filename] = file_content\n",
"#             with open(writing_path + \"\\\\\" + filename, 'wb') as f:\n",
"#                 f.write(file_content)\n",
"#         except Exception as e:\n",
"#             print(f\"Error processing PDF file{file}:{str(e)}\")\n",
"#     return file_contents\n",
"\n",
"# read the files and store them in a list\n",
"def read_files(files):\n",
"    file_contents = {}\n",
"    for file in files:\n",
"        with open(file, 'rb') as f:\n",
"            writing_path = r'C:\\Software\\weaveit\\arc\\newKbFin_18k_snow\\newKbFin'\n",
"            file_contents[os.path.join(writing_path, os.path.basename(file))] = f.read()\n",
"    return file_contents\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"  File Extension  Count\n",
"0           .txt  17766\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"# Define the directory you want to start your search from\n",
"root_dir = r'C:\\Software\\weaveit\\arc\\newKbFin_18k_snow\\newKbFin'\n",
"\n",
"\n",
"# Define an empty dictionary to store the file extensions and counts\n",
"file_extensions = {}\n",
"\n",
"# Traverse through all the directories and files in the root_dir\n",
"for subdir, dirs, files in os.walk(root_dir):\n",
"    for file in files:\n",
"        # Get the file extension\n",
"        file_ext = os.path.splitext(file)[-1].lower()\n",
"        # Add the file extension to the dictionary and increment the count\n",
"        if file_ext in file_extensions:\n",
"            file_extensions[file_ext] += 1\n",
"        else:\n",
"            file_extensions[file_ext] = 1\n",
"\n",
"# Convert the dictionary to a pandas dataframe and sort by count in descending order\n",
"df = pd.DataFrame(list(file_extensions.items()), columns=['File Extension', 'Count']).sort_values('Count', ascending=False)\n",
"\n",
"# Print the dataframe\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of files:  17766\n"
]
}
],
"source": [
"path = r'C:\\Software\\weaveit\\arc\\newKbFin_18k_snow\\newKbFin'\n",
"\n",
"extensions = ['.txt']\n",
"files = read_filenames(path=path, extensions=extensions)\n",
"# print(files)\n",
"print(\"Length of files: \", len(files))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"contents = read_files(files)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# contents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"  0%|          | 15/17766 [00:11<2:19:30,  2.12it/s]C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\weaviate\\batch\\crud_batch.py:1086: RuntimeWarning: The BatchExecutor was shutdown, most probably when it exited the `with` statement. It will be initialized again. If you are not batch importing data via the `with client.batch as batch` context manager, please make sure to shut it down when done importing data: `client.batch.shutdown()`. You can start it again using the `client.batch.start()` method.\n",
"  warnings.warn(\n",
"  5%|████      | 949/17766 [05:36<1:30:07,  3.11it/s][ERROR] Batch ConnectionError Exception occurred! Retrying in 2s. [1/3]\n",
" 14%|██████████▎ | 2449/17766 [14:28<5:43:21,  1.34s/it]Exception in thread batchSizeRefresh:\n",
"Traceback (most recent call last):\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py\", line 790, in urlopen\n",
"    response = self._make_request(\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py\", line 536, in _make_request\n",
"    response = conn.getresponse()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\urllib3\\connection.py\", line 454, in getresponse\n",
"    httplib_response = super().getresponse()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\http\\client.py\", line 1374, in getresponse\n",
"    response.begin()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\http\\client.py\", line 318, in begin\n",
"    version, status, reason = self._read_status()\n",
"  File \"C:\\Users\\x0135069\\AppData\\Local\\Programs\\Python\\Python310\\lib\\http\\client.py\", line 287, in _read_status\n",
"    raise RemoteDisconnected(\"Remote end closed connection without\"\n",
"http.client.RemoteDisconnected: Remote end closed connection without response\n",
"\n",
"During handling of the above exception, another exception occurred:\n",