chore: init

faizahmedfarooqui · Apr 16, 2023 · 50dbc25 · 50dbc25
commit 50dbc25
Show file tree

Hide file tree

Showing 5 changed files with 259 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,82 @@
+# AI in CLI
+
+## Built using LlamaIndex and the OpenAI Chat Model
+
+## What does this repository contain?
+
+- This repository contains two scripts, `construct.py` and `run.py`. Together they load your data into an OpenAI-backed index and return answers to your prompts from the OpenAI Chat Model.
+- `construct.py` creates the embeddings from the files available in the `docs` directory.
+- `run.py` runs your prompt against those same embeddings.
+- The `docs` directory contains your training data or knowledge base. You can keep the following file types in your `docs` directory:
+  - ".pdf"
+  - ".docx"
+  - ".pptx"
+  - ".jpg"
+  - ".png"
+  - ".jpeg"
+  - ".mp3"
+  - ".mp4"
+  - ".csv"
+  - ".epub"
+  - ".md"
+  - ".mbox"
+
+## What are OpenAI Embeddings?
+
+OpenAI’s text embeddings measure the relatedness of text strings. Embeddings are commonly used for:
+
+- **Search** (where results are ranked by relevance to a query string)
+- **Clustering** (where text strings are grouped by similarity)
+- **Recommendations** (where items with related text strings are recommended)
+- **Anomaly detection** (where outliers with little relatedness are identified)
+- **Diversity measurement** (where similarity distributions are analyzed)
+- **Classification** (where text strings are classified by their most similar label)
+
+An embedding is a vector (list) of floating point numbers. The distance between two vectors measures their relatedness. Small distances suggest high relatedness and large distances suggest low relatedness.
+
+## What are the Prerequisites?
+
+- OpenAI API Key
+- Python version >= 3.11.3
+- Pip version >= 23.0.1 from python (3.11)
+
+If you have the correct version of Python & Pip installed, you can use the following command to install the dependencies -
+```bash
+pip install -r requirements.txt
+```
+
+## Construct.py:
+
+- This script constructs an index using the GPTSimpleVectorIndex from documents in a specified directory.
+- The LLMPredictor is used to generate embeddings for the documents, and a PromptHelper is used to generate prompts for the LLMPredictor. The constructed index is saved to disk in JSON format.
+
+### Usage:
+
+Run the script from the directory that contains it, with the `docs` directory placed at the same level as the script.
+
+```bash
+OPENAI_API_KEY="sk-xxx...xxx" python construct.py
+```
+
+Note:
+1. Make sure to set up the dependencies and provide the necessary API credentials before running the script.
+2. Once `construct.py` has run successfully, you should see an `openai/index.json` file containing the embeddings.
+
+## Run.py:
+
+1. This script implements a chatbot using the GPTSimpleVectorIndex from a pre-constructed index.
+2. The chatbot takes input text as a command line argument and generates a response by querying
+the index.
+3. The SentenceEmbeddingOptimizer is used to optimize the responses. The generated response is
+printed to the console with special markers indicating the start and end of the response.
+
+### Usage:
+
+Run the script with the input text as the command line argument. For example:
+
+```bash
+OPENAI_API_KEY="sk-xxx...xxx" python run.py "Hello, how are you?"
+```
+
+Note:
+Make sure to provide the path to the pre-constructed index in the `GPTSimpleVectorIndex.load_from_disk()` method in the chatbot() function. Also, the chatbot may take some time to generate a response, depending on the complexity of the query and the size of the index.
diff --git a/construct.py b/construct.py
@@ -0,0 +1,30 @@
+"""
+File Name: construct.py
+Author: Faiz A. Farooqui (github.com/faizahmedfarooqui)
+"""
+
+from llama_index import LLMPredictor, GPTSimpleVectorIndex, PromptHelper, ServiceContext, SimpleDirectoryReader
+from langchain.chat_models import ChatOpenAI
+
+def construct_index(directory_path):
+	max_input_size = 4096
+	num_outputs = 2047
+	max_chunk_overlap = 10
+	chunk_size_limit = 600
+
+	prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
+
+	llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.4, model_name="gpt-4"))
+
+	service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
+
+	documents = SimpleDirectoryReader(directory_path).load_data()
+
+	index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)
+
+	index.save_to_disk('openai/index.json')
+
+	return index
+
+if __name__ == '__main__':
+	index = construct_index("docs")
diff --git a/docs/your-documentation-goes-here.md b/docs/your-documentation-goes-here.md
@@ -0,0 +1,2 @@
+
+You can write your documentation here.
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,107 @@
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+argilla==1.6.0
+async-timeout==4.0.2
+attrs==22.2.0
+backoff==2.2.1
+beautifulsoup4==4.12.2
+black==23.3.0
+bs4==0.0.1
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+commonmark==0.9.1
+contourpy==1.0.7
+cycler==0.11.0
+dataclasses-json==0.5.7
+Deprecated==1.2.13
+entrypoints==0.4
+et-xmlfile==1.1.0
+faiss-cpu==1.7.3
+fastapi==0.95.0
+ffmpy==0.3.0
+filelock==3.11.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+h11==0.14.0
+httpcore==0.16.3
+httpx==0.23.3
+huggingface-hub==0.13.4
+idna==3.4
+isort==5.12.0
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.17.3
+kiwisolver==1.4.4
+langchain==0.0.136
+libmagic==1.0
+linkify-it-py==2.0.0
+llama-index==0.5.15
+lxml==4.9.2
+Markdown==3.4.3
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+monotonic==1.6
+msg-parser==1.2.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+nltk==3.8.1
+numpy==1.23.5
+olefile==0.46
+openai==0.27.4
+openapi-schema-pydantic==1.2.4
+openpyxl==3.1.2
+orjson==3.8.10
+packaging==23.0
+pandas==1.5.3
+pathspec==0.11.1
+Pillow==9.5.0
+platformdirs==3.2.0
+pycryptodome==3.17
+pydantic==1.10.7
+pydub==0.25.1
+Pygments==2.14.0
+pypandoc==1.11
+pyparsing==3.0.9
+PyPDF2==3.0.1
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-docx==0.8.11
+python-magic==0.4.27
+python-multipart==0.0.6
+python-pptx==0.6.21
+pytz==2023.3
+PyYAML==6.0
+regex==2023.3.23
+requests==2.28.2
+rfc3986==1.5.0
+rich==13.0.1
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+soupsieve==2.4
+SQLAlchemy==1.4.47
+starlette==0.26.1
+tenacity==8.2.2
+tiktoken==0.3.3
+toolz==0.12.0
+tqdm==4.65.0
+typing-inspect==0.8.0
+typing_extensions==4.5.0
+uc-micro-py==1.0.1
+unstructured==0.5.11
+urllib3==1.26.15
+uvicorn==0.21.1
+websockets==11.0.1
+wrapt==1.14.1
+XlsxWriter==3.0.9
+yarl==1.8.2
diff --git a/run.py b/run.py
@@ -0,0 +1,38 @@
+"""
+File Name: run.py
+Author: Faiz A. Farooqui (github.com/faizahmedfarooqui)
+"""
+
+from llama_index import GPTSimpleVectorIndex
+from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer
+
+import sys
+
+EMPTY_RESPONSE = 'Empty Response'
+
+def chatbot(input_text, max_length=1000):
+	index = GPTSimpleVectorIndex.load_from_disk('openai/index.json')
+	response = ''
+	while len(response) < max_length:
+		query_text = input_text + response
+		response_chunk = index.query(query_text, optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.5)).response
+		if response_chunk == EMPTY_RESPONSE:
+			break
+		if not response_chunk:
+			break
+		print("chunk:", response_chunk)
+		response += response_chunk
+	response = response.rstrip()  # remove trailing whitespace
+	while response.endswith(EMPTY_RESPONSE):
+		response = response[:response.rfind(EMPTY_RESPONSE)]
+		response = response.rstrip()  # remove trailing whitespace
+	return response
+
+if __name__ == '__main__':
+	if len(sys.argv) < 2:
+		print("Please provide input text as a command line argument.")
+		exit(1)
+	else:
+		input_text = sys.argv[1]
+		response = chatbot(input_text)
+		print('##START##', response, '##DONE##')