From f4cad805b7c0bff5657e8a96f3df46b7c30bd107 Mon Sep 17 00:00:00 2001
From: Massaki Archambault <marchambault@badjware.dev>
Date: Wed, 5 Feb 2025 23:14:33 -0500
Subject: [PATCH] rag step1

---
 requirements.txt |   4 +-
 work/.gitignore  |   3 ++
 work/.keep       |   0
 work/rag.ipynb   | 123 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 work/.gitignore
 delete mode 100644 work/.keep
 create mode 100644 work/rag.ipynb

diff --git a/requirements.txt b/requirements.txt
index 6417135..4ebb695 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,6 @@ jupyterlab-lsp
 python-lsp-server[all]
 
 matplotlib
-pandas
\ No newline at end of file
+pandas
+
+GitPython
\ No newline at end of file
diff --git a/work/.gitignore b/work/.gitignore
new file mode 100644
index 0000000..647aa76
--- /dev/null
+++ b/work/.gitignore
@@ -0,0 +1,3 @@
+*
+!.gitignore
+!*.ipynb
\ No newline at end of file
diff --git a/work/.keep b/work/.keep
deleted file mode 100644
index e69de29..0000000
diff --git a/work/rag.ipynb b/work/rag.ipynb
new file mode 100644
index 0000000..3718e06
--- /dev/null
+++ b/work/rag.ipynb
@@ -0,0 +1,123 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook is an exploration of how a RAG (retrieval-augmented generation) system could be created. The idea is to create a chatbot that can answer questions based on the documents we provide to it (retrieved at query time, not trained into the model).\n",
+    "\n",
+    "Requirements are as follows:\n",
+    "* Source documents are in markdown format\n",
+    "* Source documents are stored in a git repository\n",
+    "* Everything needs to be self-hosted\n",
+    "  * An Ollama server is already running locally (http://localhost:11434)\n",
+    "* The interface is unimportant for now\n",
+    "  * Eventually, we want it to be a bot hosted in Teams and/or Discord"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Steps\n",
+    "\n",
+    "### Step 1: Fetch the documents from the git repository\n",
+    "\n",
+    "We will use `gitpython` to clone the repository and fetch the documents.\n",
+    "\n",
+    "For this notebook, we will ingest the documentation of prometheus-operator located in this repository: https://github.com/prometheus-operator/prometheus-operator.git. The documentation is within the *Documentation/* folder of this repository.\n",
+    "\n",
+    "First things first, we need to clone the repository. If it already exists locally, we can just pull the latest changes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from git import Repo\n",
+    "\n",
+    "repo_url = \"https://github.com/prometheus-operator/prometheus-operator.git\"\n",
+    "local_repo_path = \"./prometheus-operator\"\n",
+    "\n",
+    "if os.path.exists(local_repo_path):\n",
+    "    repo = Repo(local_repo_path)  # reuse the existing local clone...\n",
+    "    repo.remotes.origin.pull()  # ...and bring it up to date with origin\n",
+    "else:\n",
+    "    repo = Repo.clone_from(repo_url, local_repo_path)  # first run: keep the returned Repo so `repo` is always bound"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have a local copy of the repository, we can locate the documents we are interested in and list them."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 40 documents.\n"
+     ]
+    }
+   ],
+   "source": [
+    "documentation_root = os.path.join(local_repo_path, \"Documentation\")\n",
+    "\n",
+    "# Collect all markdown files; sorted because os.walk order is filesystem-dependent\n",
+    "documentation_files = sorted(\n",
+    "    os.path.join(root, name)\n",
+    "    for root, _, names in os.walk(documentation_root)\n",
+    "    for name in names if name.endswith(\".md\")\n",
+    ")\n",
+    "\n",
+    "print(f\"Found {len(documentation_files)} documents.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 2: Ingest the documents into a vector database\n",
+    "\n",
+    "To build our RAG, we need to store the documents in a vector database. Several options are available:\n",
+    "* [FAISS](https://faiss.ai/)\n",
+    "* [ChromaDB](https://www.trychroma.com/)\n",
+    "* [Qdrant](https://qdrant.tech/)\n",
+    "* etc.\n",
+    "\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}