From 2cd366fcff4b1fb91a0252e62fdaa9f2babab06c Mon Sep 17 00:00:00 2001 From: Neha Pullabhotla Date: Thu, 4 Apr 2024 10:54:35 -0700 Subject: [PATCH 1/5] Hi Mando, here is my implementation of the RAG exercise! --- neha_pullabhotla_rag_exercise.ipynb | 9140 +++++++++++++++++++++++++++ 1 file changed, 9140 insertions(+) create mode 100644 neha_pullabhotla_rag_exercise.ipynb diff --git a/neha_pullabhotla_rag_exercise.ipynb b/neha_pullabhotla_rag_exercise.ipynb new file mode 100644 index 0000000..1bbe174 --- /dev/null +++ b/neha_pullabhotla_rag_exercise.ipynb @@ -0,0 +1,9140 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-quxLKdytLor" + }, + "source": [ + "# Welcome to our ML project!\n", + "\n", + "This is a quick exercise to help demonstrate your familiarity with RAG systems - one might say that this is a place where you can b**RAG** about your skills! 🤣\n", + "\n", + "In this exercise, you will be asked to build a simple RAG system that answers some provided questions using the dataset provided. We expect this exercise to take 1-3 hours TOPS so use that to temper your approach to building this. We're not looking for reusable or production-level code - we're expressly looking for you to show us that you:\n", + "\n", + "* can explore an unknown dataset\n", + "* can use an LLM (in this case, OpenAI's GPT-3) to build a simple RAG system\n", + "\n", + "## The Dataset\n", + "\n", + "You'll find the dataset in `content.csv`. It is a set of content about companies that has been scraped from the web. 
It contains the following columns:\n", + "\n", + "* `company_id`: a unique identifier for the company (UUID)\n", + "* `company_name`: the name of the company\n", + "* `url`: the URL from which the content was scraped\n", + "* `chunk`: a chunk of the content that was scraped from the `url`\n", + "* `chunk_hash`: a hash of the chunk\n", + "* `chunk_id`: a unique identifier for the chunk of content\n", + "* `chunk_type`: the type of the chunk of content (e.g. `header`, `footer`)\n", + "\n", + "\n", + "Here's an example:\n", + "\n", + "|company_id|company_name|url|chunk_type|chunk_hash|chunk|chunk_id|\n", + "|---|---|---|---|---|---|---|\n", + "|4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8|Aboitiz Group|https://aboitiz.com/about-us/the-aboitiz-way/|head|d312f0c688076be80ee2e4af8a51c2f10cbb993a4a8de779cb4aa5545fe1051f|\"Aboitiz - The Aboitiz Way\"|be36e2f0-cd0b-42eb-b36d-c9403c2428be|\n", + "\n", + "## Step 1: Explore the dataset\n", + "\n", + "Here are some questions that we'd like you to answer about the dataset:\n", + "\n", + "1. How many companies are in the dataset?\n", + "2. How many unique URLs are in the dataset?\n", + "3. What is the most common chunk type?\n", + "4. What is the distribution of chunk types by company?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ke-HCpHPtLot", + "outputId": "0c2948c3-af12-4f37-ff39-27bf4daccaf8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.0.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (3.7.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (4.50.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.25.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (24.0)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (9.4.0)\n", + "Requirement already satisfied: 
pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (3.1.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "Collecting openai\n", + " Downloading openai-1.16.1-py3-none-any.whl (266 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m266.9/266.9 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n", + "Collecting httpx<1,>=0.23.0 (from openai)\n", + " Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai) (2.6.4)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.2)\n", + "Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from openai) (4.10.0)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.6)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.0)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) 
(2024.2.2)\n", + "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)\n", + " Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)\n", + " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai) (2.16.3)\n", + "Installing collected packages: h11, httpcore, httpx, openai\n", + "Successfully installed h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 openai-1.16.1\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.38.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.3)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.20.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.25.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.12.25)\n", + "Requirement already satisfied: requests in 
/usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", + "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.2)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.2)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.2)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (2023.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (4.10.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.2.2)\n", + "Collecting sentence-transformers\n", + " Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.3/163.3 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: transformers<5.0.0,>=4.32.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.38.2)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.66.2)\n", + "Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from 
sentence-transformers) (2.2.1+cu121)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.25.2)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.11.4)\n", + "Requirement already satisfied: huggingface-hub>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.20.3)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (9.4.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers) (3.13.3)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2023.6.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2.31.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers) (4.10.0)\n", + "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers) (24.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (3.2.1)\n", + "Requirement already satisfied: jinja2 in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (3.1.3)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m44.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m50.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m63.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54 (from 
torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nccl-cu12==2.19.3 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (2.2.0)\n", + "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m84.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers) (2023.12.25)\n", + "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers) (0.15.2)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers) (0.4.2)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from 
requests->huggingface-hub>=0.15.1->sentence-transformers) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2024.2.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)\n", + "Installing collected packages: nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, sentence-transformers\n", + "Successfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105 sentence-transformers-2.6.1\n", + "Collecting spacy-langdetect\n", + " Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.10/dist-packages (from spacy-langdetect) (7.4.4)\n", + "Collecting langdetect==1.0.7 (from spacy-langdetect)\n", + " Downloading langdetect-1.0.7.zip (998 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m998.1/998.1 kB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect==1.0.7->spacy-langdetect) (1.16.0)\n", + "Requirement already satisfied: iniconfig in /usr/local/lib/python3.10/dist-packages (from pytest->spacy-langdetect) (2.0.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from pytest->spacy-langdetect) (24.0)\n", + "Requirement already satisfied: pluggy<2.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from pytest->spacy-langdetect) (1.4.0)\n", + "Requirement already satisfied: exceptiongroup>=1.0.0rc8 in /usr/local/lib/python3.10/dist-packages (from pytest->spacy-langdetect) (1.2.0)\n", + "Requirement already satisfied: tomli>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pytest->spacy-langdetect) (2.0.1)\n", + "Building wheels for collected packages: langdetect\n", + " Building wheel for langdetect (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for langdetect: filename=langdetect-1.0.7-py3-none-any.whl size=993414 sha256=a75e1e0316355fc6372275034a21cb35434fac0b156e02fde4c0d5f1e809fc8a\n", + " Stored in directory: /root/.cache/pip/wheels/97/f1/e4/8b73f7a0421b132755956892d29b1e764d3e0857a6e92e32fe\n", + "Successfully built langdetect\n", + "Installing collected packages: langdetect, spacy-langdetect\n", + "Successfully installed langdetect-1.0.7 spacy-langdetect-0.1.2\n" + ] + } + ], + "source": [ + "%pip install pandas\n", + "%pip install matplotlib\n", + "%pip install openai\n", + "%pip install transformers\n", + "%pip install sentence-transformers\n", + "%pip install spacy-langdetect" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n0wTjYjbtLov" + }, + "source": [ + "## Step 2: RAGtime!\n", + "\n", + "Now that you're a little more familiar with the dataset, let's build a simple RAG system that uses OpenAI to help answer some questions about the dataset. 
To reiterate, we don't expect you to add anything else to the environment to build this system - for example, you don't need to set up a database or anything like that. You can add any libraries you need to the environment, but we'd like you to use OpenAI for any and all tasks that require a language model (we'll send you a key to use).\n", + "\n", + "That being said, we'd like you to show the specifics of how a RAG implementation works so please avoid using any libraries that provide end-to-end RAG implementations.\n", + "\n", + "Here is the question that we'd like you to answer via your RAG system:\n", + "\n", + "1. What does the company Caravan Health do?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d91PeIlFBm3u" + }, + "source": [ + "### Step 1: Data Exploration" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Import Libraries" + ], + "metadata": { + "id": "ZDHrHDRN7Ihd" + } + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "id": "sygX0Vn9tLov" + }, + "outputs": [], + "source": [ + "import os\n", + "import pickle\n", + "import pandas as pd\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "from collections import defaultdict\n", + "import json\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "DJkDRlIUtLo0", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "9bf54f5e-319a-4dd9-f3c7-70c1306661b9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n", + "[nltk_data] Downloading package words to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/words.zip.\n" + ] + } + ], + "source": [ + "import nltk\n", + "import spacy\n", + "nltk.download('punkt')\n", + "nltk.download('words')\n", + "from nltk.tokenize import 
sent_tokenize, word_tokenize\n", + "from spacy_langdetect import LanguageDetector\n", + "from spacy.language import Language" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "0yx_jFPNAb5E", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "da8d34fa-e645-433d-9508-e2f348403770" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### Data Processing" + ], + "metadata": { + "id": "RfmuQETN7QwR" + } + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "VRhU5aMwtLov", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 486 + }, + "outputId": "22eec512-1cf6-44c5-b6f2-90f24deea953" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " company_id company_name \\\n", + "0 4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8 Aboitiz Group \n", + "1 4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8 Aboitiz Group \n", + "2 4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8 Aboitiz Group \n", + "3 4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8 Aboitiz Group \n", + "4 47a9fbaa-0c12-442e-b030-78d3ee7b15d3 Bespoke \n", + "\n", + " url chunk_type \\\n", + "0 https://aboitiz.com/about-us/the-aboitiz-way/ head \n", + "1 https://aboitiz.com/about-us/the-aboitiz-way/ header \n", + "2 https://aboitiz.com/about-us/the-aboitiz-way/ footer \n", + "3 https://aboitiz.com/about-us/the-aboitiz-way/ main \n", + "4 https://www.be-spoke.io/privacy-policy head \n", + "\n", + " chunk_hash \\\n", + "0 d312f0c688076be80ee2e4af8a51c2f10cbb993a4a8de7... \n", + "1 2c5534296a5e671efb47ccd627d797ec4dbe53dd393c98... \n", + "2 0bb3d3e6ca11963bcaeeacb07c26a781bc290c878cba57... \n", + "3 19800e7fa254fdaa8ce593a66961c1ba92b0bd3245c3d9... \n", + "4 3f28494cceaeb2ab4060b91d19046b3760de41021fa0fb... 
\n", + "\n", + " chunk \\\n", + "0 Aboitiz | ... \n", + "1 <header class=\"scroll_header_top_area stick sc... \n", + "2 <footer><ul><li><a href=\"https://aboitiz.com/\"... \n", + "3 <body class=\"page-template page-template-child... \n", + "4 <head><meta charset=\"utf-8\"/><title>プライバシーポリシー... \n", + "\n", + " chunk_id \n", + "0 be36e2f0-cd0b-42eb-b36d-c9403c2428be \n", + "1 bfba99bd-49d1-43b1-9728-5c04edd151a4 \n", + "2 7cc48405-f7ef-45a8-99f2-9f900e930e3d \n", + "3 319ac94f-2547-4d9e-b353-cd7997d44ec6 \n", + "4 37080a59-9d5c-465d-9d59-807ef3cfb435 " + ], + "text/html": [ + "\n", + " <div id=\"df-ea6d535f-eaeb-4c97-934b-c5d1dca0e8be\" class=\"colab-df-container\">\n", + " <div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>company_id</th>\n", + " <th>company_name</th>\n", + " <th>url</th>\n", + " <th>chunk_type</th>\n", + " <th>chunk_hash</th>\n", + " <th>chunk</th>\n", + " <th>chunk_id</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8</td>\n", + " <td>Aboitiz Group</td>\n", + " <td>https://aboitiz.com/about-us/the-aboitiz-way/</td>\n", + " <td>head</td>\n", + " <td>d312f0c688076be80ee2e4af8a51c2f10cbb993a4a8de7...</td>\n", + " <td><head><meta charset=\"utf-8\"/><title>Aboitiz | ...</td>\n", + " <td>be36e2f0-cd0b-42eb-b36d-c9403c2428be</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8</td>\n", + " <td>Aboitiz Group</td>\n", + " <td>https://aboitiz.com/about-us/the-aboitiz-way/</td>\n", + " <td>header</td>\n", + " 
<td>2c5534296a5e671efb47ccd627d797ec4dbe53dd393c98...</td>\n", + " <td><header class=\"scroll_header_top_area stick sc...</td>\n", + " <td>bfba99bd-49d1-43b1-9728-5c04edd151a4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8</td>\n", + " <td>Aboitiz Group</td>\n", + " <td>https://aboitiz.com/about-us/the-aboitiz-way/</td>\n", + " <td>footer</td>\n", + " <td>0bb3d3e6ca11963bcaeeacb07c26a781bc290c878cba57...</td>\n", + " <td><footer><ul><li><a href=\"https://aboitiz.com/\"...</td>\n", + " <td>7cc48405-f7ef-45a8-99f2-9f900e930e3d</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4c1fde18-8a40-4ee7-9c3c-19152c7d1ff8</td>\n", + " <td>Aboitiz Group</td>\n", + " <td>https://aboitiz.com/about-us/the-aboitiz-way/</td>\n", + " <td>main</td>\n", + " <td>19800e7fa254fdaa8ce593a66961c1ba92b0bd3245c3d9...</td>\n", + " <td><body class=\"page-template page-template-child...</td>\n", + " <td>319ac94f-2547-4d9e-b353-cd7997d44ec6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>47a9fbaa-0c12-442e-b030-78d3ee7b15d3</td>\n", + " <td>Bespoke</td>\n", + " <td>https://www.be-spoke.io/privacy-policy</td>\n", + " <td>head</td>\n", + " <td>3f28494cceaeb2ab4060b91d19046b3760de41021fa0fb...</td>\n", + " <td><head><meta charset=\"utf-8\"/><title>プライバシーポリシー...</td>\n", + " <td>37080a59-9d5c-465d-9d59-807ef3cfb435</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>\n", + " <div class=\"colab-df-buttons\">\n", + "\n", + " <div class=\"colab-df-container\">\n", + " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ea6d535f-eaeb-4c97-934b-c5d1dca0e8be')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\">\n", + "\n", + " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", + " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 
220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", + " </svg>\n", + " </button>\n", + "\n", + " <style>\n", + " .colab-df-container {\n", + " display:flex;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " .colab-df-buttons div {\n", + " margin-bottom: 4px;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " </style>\n", + "\n", + " <script>\n", + " const buttonEl =\n", + " document.querySelector('#df-ea6d535f-eaeb-4c97-934b-c5d1dca0e8be button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-ea6d535f-eaeb-4c97-934b-c5d1dca0e8be');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? 
Visit the ' +\n", + " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " </script>\n", + " </div>\n", + "\n", + "\n", + "<div id=\"df-e0573b55-bd55-49c9-89d4-3265f8839183\">\n", + " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-e0573b55-bd55-49c9-89d4-3265f8839183')\"\n", + " title=\"Suggest charts\"\n", + " style=\"display:none;\">\n", + "\n", + "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\">\n", + " <g>\n", + " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n", + " </g>\n", + "</svg>\n", + " </button>\n", + "\n", + "<style>\n", + " .colab-df-quickchart {\n", + " --bg-color: #E8F0FE;\n", + " --fill-color: #1967D2;\n", + " --hover-bg-color: #E2EBFA;\n", + " --hover-fill-color: #174EA6;\n", + " --disabled-fill-color: #AAA;\n", + " --disabled-bg-color: #DDD;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-quickchart {\n", + " --bg-color: #3B4455;\n", + " --fill-color: #D2E3FC;\n", + " --hover-bg-color: #434B5C;\n", + " --hover-fill-color: #FFFFFF;\n", + " --disabled-bg-color: #3B4455;\n", + " --disabled-fill-color: #666;\n", + " }\n", + "\n", + " .colab-df-quickchart {\n", + " background-color: var(--bg-color);\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: var(--fill-color);\n", + " height: 32px;\n", + " padding: 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-quickchart:hover {\n", + " background-color: 
var(--hover-bg-color);\n", + " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: var(--button-hover-fill-color);\n", + " }\n", + "\n", + " .colab-df-quickchart-complete:disabled,\n", + " .colab-df-quickchart-complete:disabled:hover {\n", + " background-color: var(--disabled-bg-color);\n", + " fill: var(--disabled-fill-color);\n", + " box-shadow: none;\n", + " }\n", + "\n", + " .colab-df-spinner {\n", + " border: 2px solid var(--fill-color);\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " animation:\n", + " spin 1s steps(1) infinite;\n", + " }\n", + "\n", + " @keyframes spin {\n", + " 0% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " border-left-color: var(--fill-color);\n", + " }\n", + " 20% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 30% {\n", + " border-color: transparent;\n", + " border-left-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 40% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-top-color: var(--fill-color);\n", + " }\n", + " 60% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " }\n", + " 80% {\n", + " border-color: transparent;\n", + " border-right-color: var(--fill-color);\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " 90% {\n", + " border-color: transparent;\n", + " border-bottom-color: var(--fill-color);\n", + " }\n", + " }\n", + "</style>\n", + "\n", + " <script>\n", + " async function quickchart(key) {\n", + " const quickchartButtonEl =\n", + " document.querySelector('#' + key + ' button');\n", + " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", + " 
quickchartButtonEl.classList.add('colab-df-spinner');\n", + " try {\n", + " const charts = await google.colab.kernel.invokeFunction(\n", + " 'suggestCharts', [key], {});\n", + " } catch (error) {\n", + " console.error('Error during call to suggestCharts:', error);\n", + " }\n", + " quickchartButtonEl.classList.remove('colab-df-spinner');\n", + " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", + " }\n", + " (() => {\n", + " let quickchartButtonEl =\n", + " document.querySelector('#df-e0573b55-bd55-49c9-89d4-3265f8839183 button');\n", + " quickchartButtonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + " })();\n", + " </script>\n", + "</div>\n", + "\n", + " </div>\n", + " </div>\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data", + "summary": "{\n \"name\": \"data\",\n \"rows\": 2128,\n \"fields\": [\n {\n \"column\": \"company_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 75,\n \"samples\": [\n \"a45e4bd7-0ea7-4c3f-ba6a-cdc5027f6c60\",\n \"a07fb185-9f97-498c-bcda-a4c21fe27467\",\n \"aab2261b-7065-460d-83de-99404fb50f65\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"company_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 75,\n \"samples\": [\n \"bingotech.net\",\n \"Universit\\u00e0 degli Studi di Milano\",\n \"betahaus Sofia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"url\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 530,\n \"samples\": [\n \"https://www.actionvfx.com/\",\n \"https://achiredo.com/industrial-and-enterprise-solutions.html\",\n \"https://www.accel.aero/legal/\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk_type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 
\"header\",\n \"main\",\n \"head\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk_hash\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1337,\n \"samples\": [\n \"0ed5f2c605680dcee194a1dac8e889f3bdf4fe31ad88555006de4c8ff02ddfa0\",\n \"06cbc8026aae9d62be16b19eb05610dee7f6f467d98c05ed83c9b1d2b7c178ed\",\n \"4e3aedf926cd7fa2c0cc33e57d11a746a036d0f744ed3355dc2b27216705a245\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1337,\n \"samples\": [\n \"<footer class=\\\"site-footer\\\"><img alt=\\\"ABBE Technology Solutions, Inc.\\\"/>Quick Links<ul itemscope=\\\"\\\" itemtype=\\\"http://www.schema.org/SiteNavigationElement\\\"><li><a href=\\\"https://abbe.com.ph/about-us/\\\">About Us</a></li><li><a href=\\\"https://abbe.com.ph/featuredproducts/\\\">Featured Products</a></li><li><a href=\\\"https://abbe.com.ph/advancement/\\\">Advancement</a></li><li><a href=\\\"https://abbe.com.ph/news/\\\">Blogs</a></li><li><a href=\\\"https://abbe.com.ph/inquiry/\\\">Inquiry</a></li><li><a href=\\\"https://abbe.com.ph/privacy-policy/\\\">Privacy Policy</a></li></ul>Follow Us<section><ul><li><a href=\\\"https://web.facebook.com/ABBETechnologySolutionsInc/\\\">ABBE Technology Solutions, Inc.</a></li><li><a href=\\\"https://www.linkedin.com/company/71672056\\\">ABBE Technology</a></li><li><a href=\\\"https://twitter.com/ABBETechnology\\\">ABBE Technology Solutions, Inc.</a></li></ul></section>Contact Us<ul><li>+632 - 8892-4927</li><li><a>+63-9616034964</a></li><li><a href=\\\"/cdn-cgi/l/email-protection#10797e767f50717272753e737f7d3e6078\\\">[email protected]</a></li></ul>Address GC Corporate Plaza 150 Legaspi Street Legazpi Village, Makati City Metro Manila, Philippines<p>\\u00a9 2024 ABBE Technology Solutions, Inc. 
All Rights Reserved.</p></footer>\",\n \"<head><meta charset=\\\"utf-8\\\"/><meta content=\\\"width=device-width, initial-scale=1.0\\\" name=\\\"viewport\\\"/><meta content=\\\"index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1\\\" name=\\\"robots\\\"/><title>Home Alarm App | Home Security App By Alarm.com\",\n \"

Contact Us

\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chunk_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2128,\n \"samples\": [\n \"c6f08030-5a79-4efa-ad80-345101605ee5\",\n \"57e32f4e-5ae1-42ec-85f9-67f13c688e54\",\n \"7f6c5256-0c86-4115-821e-9191cd383d39\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "# Read the data to a pandas dataframe\n", + "data = pd.read_csv('/content/drive/MyDrive/exercises/machine-learning/content.csv')\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "yRjRNoq_tLow", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 246 + }, + "outputId": "08f04679-43af-4afe-8848-65b78c1d17dd" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " company_id company_name \\\n", + "count 2128 2128 \n", + "unique 75 75 \n", + "top 1d793954-ad6e-4d64-84d6-965b6d7e6e28 Abelini \n", + "freq 55 55 \n", + "\n", + " url chunk_type \\\n", + "count 2128 2128 \n", + "unique 530 4 \n", + "top https://www.westdermatology.com/ header \n", + "freq 10 549 \n", + "\n", + " chunk_hash \\\n", + "count 2128 \n", + "unique 1337 \n", + "top fa4f773a4d750c100202d812c3b71cf999a8472b14cebc... \n", + "freq 11 \n", + "\n", + " chunk \\\n", + "count 2128 \n", + "unique 1337 \n", + "top