diff --git a/.github/workflows/ipynb_tests.yml b/.github/workflows/ipynb_tests.yml index 1b295a0..f78a6e1 100644 --- a/.github/workflows/ipynb_tests.yml +++ b/.github/workflows/ipynb_tests.yml @@ -1,23 +1,29 @@ name: Notebook tests -on: [push] +on: [push, pull_request] jobs: - build: - + test-notebooks: runs-on: ubuntu-latest + # Test on multiple Python versions + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.7 - uses: actions/setup-python@v1 + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 with: - python-version: 3.7 + python-version: ${{ matrix.python-version }} + # Caching for faster dependency installation on subsequent runs + cache: 'pip' - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt - name: Test notebooks with pytest run: | - pip install pytest nbval pytest --nbval diff --git a/OOP-design/designing_amazon_online_shopping_system.md b/OOP-design/designing_amazon_online_shopping_system.md new file mode 100644 index 0000000..ff9d6b9 --- /dev/null +++ b/OOP-design/designing_amazon_online_shopping_system.md @@ -0,0 +1,285 @@ +# Designing Amazon - Online Shopping System +Let's design an online retail store. +For the sake of this problem, we'll focus on Amazon's retail business where users can buy/sell products. + + +## Requirements and Goals of the System +1. Users should be able to: + - Add new products to sell. + - Search products by name or category. + - Buy products only if they are registered members. + - Remove/modify product items in their shopping cart. + - Checkout and buy items in the shopping cart. + - Rate and review a product. + - Specify a shipping address where their order will be delivered. + - Cancel and order if it hasn't been shipped. + - Pay via debit or credit cards + - Track their shipment to see the current state of their order. +2. The system should be able to: + - Send a notification whenever the shipping status of the order changes. + +## Use Case Diagram +We have four main actors in the system: + +- **Admin:** Mainly responsible for account management, adding and modifying new product categories. + +- **Guest:** All guests can search the catalog, add/remove items on the shopping cart, and also become registered members. +- **Member:** In addition to what guests can do, members can place orders and add new products to sell +- **System:** Mainly responsible for sending notifications for orders and shipping updates. + + +Top use cases therefore include: +1. Add/Update products: whenever a product is added/modified, update the catalog. +2. Search for products by their name or category. +3. Add/remove product items from shopping cart. +4. Checkout to buy a product item in the shopping cart. +5. Make a payment to place an order. +6. Add a new product category. +7. Send notifications about order shipment updates to members. 
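Before the detailed class code below, a small sketch can make use case 2 (search by name or category) concrete. The `Catalog` and `CatalogEntry` names here are illustrative assumptions and are not part of the class design that follows:

```python
from collections import namedtuple

# Lightweight stand-in for a product record (illustrative only; the real Product class comes later).
CatalogEntry = namedtuple("CatalogEntry", ["product_id", "name", "category"])


class Catalog:
    """Hypothetical searchable catalog supporting lookups by name or category."""

    def __init__(self):
        self._by_name = {}       # lowercased name -> list of entries
        self._by_category = {}   # lowercased category -> list of entries

    def add_entry(self, entry):
        self._by_name.setdefault(entry.name.lower(), []).append(entry)
        self._by_category.setdefault(entry.category.lower(), []).append(entry)

    def search_by_name(self, name):
        return self._by_name.get(name.lower(), [])

    def search_by_category(self, category):
        return self._by_category.get(category.lower(), [])


catalog = Catalog()
catalog.add_entry(CatalogEntry(1, "Kindle", "Electronics"))
print(catalog.search_by_category("electronics"))  # -> [CatalogEntry(product_id=1, ...)]
```

A real deployment would back this with a search index, but the interface the rest of the system sees can stay this simple.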
+ + +### Code + +First we define the enums, datatypes and constants that'll be used by the rest of the classes: + + +```python +from enum import Enum + + +class AccountStatus(Enum): + ACTIVE, BLOCKED, BANNED, COMPROMISED, ARCHIVED, UNKNOWN = 1, 2, 3, 4, 5, 6 + +class OrderStatus(Enum): + UNSHIPPED, SHIPPED, PENDING, COMPLETED, CANCELED, REFUND_APPLIED = 1, 2, 3, 4, 5, 6 + +class ShipmentStatus(Enum): + PENDING, SHIPPED, DELIVERED, ON_HOLD = 1, 2, 3, 4 + +class PaymentStatus(Enum): + UNPAID, PENDING, COMPLETED, FILLED, DECLINED = 1, 2, 3, 4, 5 + CANCELLED, ABANDONED, SETTLING, SETTLED, REFUNDED = 6, 7, 8, 9, 10 + +``` + +#### Account, Customer, Admin and Guest classes +These classes represent different people that interact with the system. + + +```python +from abc import ABC, abstractmethod + + +class Account: + """Python strives to adhere to Uniform Access Principle. + + So there's no need for getter and setter methods. + """ + + def __init__(self, username, password, name, email, phone, shipping_address, status:AccountStatus): + # "private" attributes + self._username = username + self._password = password + self._email = email + self._phone = phone + self._shipping_address = shipping_address + self._status = status.ACTIVE + self._credit_cards = [] + self._bank_accounts = [] + + def add_product(self, product): + pass + + def add_product_review(self, review): + pass + + def reset_password(self): + pass + + +class Customer(ABC): + def __init__(self, cart, order): + self._cart = cart + self._order = order + + def get_shopping_cart(self): + return self.cart + + def add_item_to_cart(self, item): + raise NotImplementedError + + def remove_item_from_cart(self, item): + raise NotImplementedError + + +class Guest(Customer): + def register_account(self): + pass + + +class Member(Customer): + def __init__(self, account:Account): + self._account = account + + def place_order(self, order): + pass + +``` + + +```python +# Test class definition +g = Guest(cart="Cart1", order="Order1") +print(hasattr(g, "remove_item_from_cart")) +print(isinstance(g, Customer)) +``` + + True + True + + +#### Product Category, Product and Product Review +The classes below are related to a product: + + +```python +class Product: + def __init__(self, product_id, name, description, price, category, available_item_count): + self._product_id = product_id + self._name = name + self._price = price + self._category = category + self._available_item_count = 0 + + def update_price(self, new_price): + self._price = new_price + + +class ProductCategory: + def __init__(self, name, description): + self._name = name + self._description = description + + +class ProductReview: + def __init__(self, rating, review, reviewer): + self._rating = rating + self._review = review + self._reviewer = reviewer + +``` + +#### ShoppingCart, Item, Order and OrderLog +Users will add items to the shopping cart and place an order to buy all the items in the cart. 
+ + +```python +class Item: + def __init__(self, item_id, quantity, price): + self._item_id = item_id + self._quantity = quantity + self._price = price + + def update_quantity(self, quantity): + self._quantity = quantity + + def __repr__(self): + return f"ItemID:<{self._item_id}>" + + +class ShoppingCart: + """We can still access items by calling items instead of having getter method + """ + def __init__(self): + self._items = [] + + def add_item(self, item): + self._items.append(item) + + def remove_item(self, item): + self._items.remove(item) + + def update_item_quantity(self, item, quantity): + pass + +``` + + +```python +item = Item(item_id=1, quantity=2, price=300) +cart = ShoppingCart() +cart.add_item(item) +``` + + +```python +# shopping cart now has items +cart._items +``` + + + + + [ItemID:<1>] + + + + +```python +import datetime + + +class OrderLog: + def __init__(self, order_number, status=OrderStatus.PENDING): + self._order_number = order_number + self._creation_date = datetime.date.today() + self._status = status + + +class Order: + def __init__(self, order_number, status=OrderStatus.PENDING): + self._order_number = order_number + self._status = status + self._order_date = datetime.date.today() + self._order_log = [] + + def send_for_shipment(self): + pass + + def make_payment(self, payment): + pass + + def add_order_log(self, order_log): + pass +``` + +#### Shipment and Notification +After successfully placing an order and processing the payment, a shipment record will be created. +Let's define the Shipment and Notification classes: + + +```python +import datetime + + +class ShipmentLog: + def __init__(self, shipment_id, status=ShipmentStatus.PENDING): + self._shipment_id = shipment_id + self.shipment_status = status + + +class Shipment: + def __init__(self, shipment_id, shipment_method, eta=None, shipment_logs=[]): + self._shipment_id = shipment_id + self._shipment_date = datetime.date.today() + self._eta = eta + self._shipment_logs = shipment_logs + + +class Notification(ABC): + def __init__(self, notification_id, content): + self._notification_id = notification_id + self._created_on = datetime.datetime.now() + self._content = content + +``` diff --git a/OOP-design/executed_amazon_system.ipynb b/OOP-design/executed_amazon_system.ipynb new file mode 100644 index 0000000..3abe9d2 --- /dev/null +++ b/OOP-design/executed_amazon_system.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Designing Amazon - Online Shopping System\n", + "Let's design an online retail store.\n", + "For the sake of this problem, we'll focus on Amazon's retail business where users can buy/sell products.\n", + "\n", + "\n", + "## Requirements and Goals of the System\n", + "1. Users should be able to:\n", + " - Add new products to sell.\n", + " - Search products by name or category.\n", + " - Buy products only if they are registered members.\n", + " - Remove/modify product items in their shopping cart.\n", + " - Checkout and buy items in the shopping cart.\n", + " - Rate and review a product.\n", + " - Specify a shipping address where their order will be delivered.\n", + " - Cancel and order if it hasn't been shipped.\n", + " - Pay via debit or credit cards\n", + " - Track their shipment to see the current state of their order.\n", + "2. The system should be able to:\n", + " - Send a notification whenever the shipping status of the order changes." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case Diagram\n", + "We have four main actors in the system:\n", + "\n", + "- **Admin:** Mainly responsible for account management, adding and modifying new product categories.\n", + "\n", + "- **Guest:** All guests can search the catalog, add/remove items on the shopping cart, and also become registered members.\n", + "- **Member:** In addition to what guests can do, members can place orders and add new products to sell\n", + "- **System:** Mainly responsible for sending notifications for orders and shipping updates.\n", + "\n", + "\n", + "Top use cases therefore include:\n", + "1. Add/Update products: whenever a product is added/modified, update the catalog.\n", + "2. Search for products by their name or category.\n", + "3. Add/remove product items from shopping cart.\n", + "4. Checkout to buy a product item in the shopping cart.\n", + "5. Make a payment to place an order.\n", + "6. Add a new product category.\n", + "7. Send notifications about order shipment updates to members. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Code\n", + "\n", + "First we define the enums, datatypes and constants that'll be used by the rest of the classes:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.847393Z", + "iopub.status.busy": "2025-05-21T08:11:04.846994Z", + "iopub.status.idle": "2025-05-21T08:11:04.861135Z", + "shell.execute_reply": "2025-05-21T08:11:04.859926Z" + } + }, + "outputs": [], + "source": [ + "from enum import Enum\n", + "\n", + "\n", + "class AccountStatus(Enum):\n", + " ACTIVE, BLOCKED, BANNED, COMPROMISED, ARCHIVED, UNKNOWN = 1, 2, 3, 4, 5, 6\n", + "\n", + "class OrderStatus(Enum):\n", + " UNSHIPPED, SHIPPED, PENDING, COMPLETED, CANCELED, REFUND_APPLIED = 1, 2, 3, 4, 5, 6\n", + "\n", + "class ShipmentStatus(Enum):\n", + " PENDING, SHIPPED, DELIVERED, ON_HOLD = 1, 2, 3, 4\n", + " \n", + "class PaymentStatus(Enum):\n", + " UNPAID, PENDING, COMPLETED, FILLED, DECLINED = 1, 2, 3, 4, 5\n", + " CANCELLED, ABANDONED, SETTLING, SETTLED, REFUNDED = 6, 7, 8, 9, 10\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Account, Customer, Admin and Guest classes \n", + "These classes represent different people that interact with the system." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.865796Z", + "iopub.status.busy": "2025-05-21T08:11:04.865398Z", + "iopub.status.idle": "2025-05-21T08:11:04.875796Z", + "shell.execute_reply": "2025-05-21T08:11:04.874579Z" + } + }, + "outputs": [], + "source": [ + "from abc import ABC, abstractmethod\n", + "\n", + "\n", + "class Account:\n", + " \"\"\"Python strives to adhere to Uniform Access Principle. \n", + " \n", + " So there's no need for getter and setter methods. 
\n", + " \"\"\"\n", + " \n", + " def __init__(self, username, password, name, email, phone, shipping_address, status:AccountStatus):\n", + " # \"private\" attributes \n", + " self._username = username\n", + " self._password = password\n", + " self._email = email\n", + " self._phone = phone\n", + " self._shipping_address = shipping_address\n", + " self._status = status.ACTIVE\n", + " self._credit_cards = []\n", + " self._bank_accounts = []\n", + " \n", + " def add_product(self, product):\n", + " pass\n", + " \n", + " def add_product_review(self, review):\n", + " pass\n", + " \n", + " def reset_password(self):\n", + " pass\n", + "\n", + "\n", + "class Customer(ABC):\n", + " def __init__(self, cart, order):\n", + " self._cart = cart\n", + " self._order = order\n", + " \n", + " def get_shopping_cart(self):\n", + " return self.cart\n", + " \n", + " def add_item_to_cart(self, item):\n", + " raise NotImplementedError\n", + " \n", + " def remove_item_from_cart(self, item):\n", + " raise NotImplementedError\n", + " \n", + "\n", + "class Guest(Customer):\n", + " def register_account(self):\n", + " pass\n", + "\n", + "\n", + "class Member(Customer):\n", + " def __init__(self, account:Account):\n", + " self._account = account\n", + " \n", + " def place_order(self, order):\n", + " pass\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.880825Z", + "iopub.status.busy": "2025-05-21T08:11:04.880407Z", + "iopub.status.idle": "2025-05-21T08:11:04.887244Z", + "shell.execute_reply": "2025-05-21T08:11:04.885771Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n" + ] + } + ], + "source": [ + "# Test class definition\n", + "g = Guest(cart=\"Cart1\", order=\"Order1\")\n", + "print(hasattr(g, \"remove_item_from_cart\"))\n", + "print(isinstance(g, Customer))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Product Category, Product and Product Review\n", + "The classes below are related to a product:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.961503Z", + "iopub.status.busy": "2025-05-21T08:11:04.961118Z", + "iopub.status.idle": "2025-05-21T08:11:04.968780Z", + "shell.execute_reply": "2025-05-21T08:11:04.967731Z" + } + }, + "outputs": [], + "source": [ + "class Product:\n", + " def __init__(self, product_id, name, description, price, category, available_item_count):\n", + " self._product_id = product_id\n", + " self._name = name\n", + " self._price = price\n", + " self._category = category\n", + " self._available_item_count = 0\n", + " \n", + " def update_price(self, new_price):\n", + " self._price = new_price\n", + " \n", + " \n", + "class ProductCategory:\n", + " def __init__(self, name, description):\n", + " self._name = name\n", + " self._description = description\n", + " \n", + "\n", + "class ProductReview:\n", + " def __init__(self, rating, review, reviewer):\n", + " self._rating = rating\n", + " self._review = review\n", + " self._reviewer = reviewer\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### ShoppingCart, Item, Order and OrderLog\n", + "Users will add items to the shopping cart and place an order to buy all the items in the cart." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.972751Z", + "iopub.status.busy": "2025-05-21T08:11:04.972386Z", + "iopub.status.idle": "2025-05-21T08:11:04.980017Z", + "shell.execute_reply": "2025-05-21T08:11:04.978809Z" + } + }, + "outputs": [], + "source": [ + "class Item:\n", + " def __init__(self, item_id, quantity, price):\n", + " self._item_id = item_id\n", + " self._quantity = quantity\n", + " self._price = price\n", + " \n", + " def update_quantity(self, quantity):\n", + " self._quantity = quantity\n", + " \n", + " def __repr__(self):\n", + " return f\"ItemID:<{self._item_id}>\" \n", + "\n", + "\n", + "class ShoppingCart:\n", + " \"\"\"We can still access items by calling items instead of having getter method\n", + " \"\"\"\n", + " def __init__(self):\n", + " self._items = []\n", + " \n", + " def add_item(self, item):\n", + " self._items.append(item)\n", + " \n", + " def remove_item(self, item):\n", + " self._items.remove(item)\n", + " \n", + " def update_item_quantity(self, item, quantity):\n", + " pass\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.984265Z", + "iopub.status.busy": "2025-05-21T08:11:04.983913Z", + "iopub.status.idle": "2025-05-21T08:11:04.988860Z", + "shell.execute_reply": "2025-05-21T08:11:04.987924Z" + } + }, + "outputs": [], + "source": [ + "item = Item(item_id=1, quantity=2, price=300)\n", + "cart = ShoppingCart()\n", + "cart.add_item(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:04.993017Z", + "iopub.status.busy": "2025-05-21T08:11:04.992632Z", + "iopub.status.idle": "2025-05-21T08:11:05.002172Z", + "shell.execute_reply": "2025-05-21T08:11:05.000955Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[ItemID:<1>]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# shopping cart now has items\n", + "cart._items" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:05.006206Z", + "iopub.status.busy": "2025-05-21T08:11:05.005873Z", + "iopub.status.idle": "2025-05-21T08:11:05.013034Z", + "shell.execute_reply": "2025-05-21T08:11:05.011861Z" + } + }, + "outputs": [], + "source": [ + "import datetime\n", + "\n", + "\n", + "class OrderLog:\n", + " def __init__(self, order_number, status=OrderStatus.PENDING):\n", + " self._order_number = order_number\n", + " self._creation_date = datetime.date.today()\n", + " self._status = status\n", + " \n", + "\n", + "class Order:\n", + " def __init__(self, order_number, status=OrderStatus.PENDING):\n", + " self._order_number = order_number\n", + " self._status = status\n", + " self._order_date = datetime.date.today()\n", + " self._order_log = []\n", + " \n", + " def send_for_shipment(self):\n", + " pass\n", + " \n", + " def make_payment(self, payment):\n", + " pass\n", + " \n", + " def add_order_log(self, order_log):\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Shipment and Notification\n", + "After successfully placing an order and processing the payment, a shipment record will be created.\n", + "Let's define the Shipment and Notification classes:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + 
"iopub.execute_input": "2025-05-21T08:11:05.017277Z", + "iopub.status.busy": "2025-05-21T08:11:05.016929Z", + "iopub.status.idle": "2025-05-21T08:11:05.024473Z", + "shell.execute_reply": "2025-05-21T08:11:05.023075Z" + } + }, + "outputs": [], + "source": [ + "import datetime\n", + "\n", + "\n", + "class ShipmentLog:\n", + " def __init__(self, shipment_id, status=ShipmentStatus.PENDING):\n", + " self._shipment_id = shipment_id\n", + " self.shipment_status = status\n", + "\n", + "\n", + "class Shipment:\n", + " def __init__(self, shipment_id, shipment_method, eta=None, shipment_logs=[]):\n", + " self._shipment_id = shipment_id\n", + " self._shipment_date = datetime.date.today()\n", + " self._eta = eta\n", + " self._shipment_logs = shipment_logs\n", + " \n", + "\n", + "class Notification(ABC):\n", + " def __init__(self, notification_id, content):\n", + " self._notification_id = notification_id\n", + " self._created_on = datetime.datetime.now()\n", + " self._content = content\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/README.md b/README.md index 93ad702..4c23b42 100644 --- a/README.md +++ b/README.md @@ -6,24 +6,24 @@ A curated collection of approaches to creating large scale distributed systems d System Design Problems: -1. [Design Instagram](designing_instagram.ipynb) +1. [Design Instagram](designing_instagram.md) 2. [Design Twitter](designing_twitter.md) -3. [Design Twitter Search](designing_twitter_search.ipynb) -4. [Design Instagram Feed](designing_instagram_feed.ipynb) +3. [Design Twitter Search](designing_twitter_search.md) +4. [Design Instagram Feed](designing_instagram_feed.md) 5. [Design Youtube or Netflix](designing_youtube_or_netflix.md) 6. [Design Uber or Lyft](designing_uber_backend.md) 7. [Design a Typeahead Suggestion like Google Search](designing_typeahead_suggestion.md) -8. [Design an API Rate Limiter](designing_api_rate_limiter.ipynb) +8. [Design an API Rate Limiter](designing_api_rate_limiter.md) 9. [Design an E-ticketing service like Ticketmaster](designing_ticketmaster.md) -10. [Design a Web Crawler](designing_webcrawler.ipynb) -11. [Design Cloud File Storage like Dropbox/GDrive](designing-cloud-storage.ipynb) -12. [Distributed Logging](distributed_logging.ipynb) +10. [Design a Web Crawler](designing_webcrawler.md) +11. [Design Cloud File Storage like Dropbox/GDrive](designing-cloud-storage.md) +12. [Distributed Logging](distributed_logging.md)   Object Oriented Design Problems: -1. [Design Amazon: Online Shopping System](OOP-design/designing_amazon_online_shopping_system.ipynb) +1. [Design Amazon: Online Shopping System](OOP-design/designing_amazon_online_shopping_system.md) ## Step 0: Intro diff --git a/designing-cloud-storage.md b/designing-cloud-storage.md new file mode 100644 index 0000000..b8eca47 --- /dev/null +++ b/designing-cloud-storage.md @@ -0,0 +1,170 @@ +# Designing a Cloud Storage Service like Dropbox or Google Drive +Cloud file storage enables users to store their data on remote servers. + +Why cloud storage? +1. **Availability:** Users can access their files from any devices, anywhere, anytime. +2. 
**Durability and Reliability:** Cloud storage ensures that users never lose their data by storing their data on different geographically located servers. +3. **Scalability:** Users will never have to worry about running out of storage space, as long as they are ready to pay for it. + + + +## 1. Requirements and System Goals +What do we want to achieve from a cloud storage system? +1. Users should be able to upload/download files from any device. +2. Users should be able to share files and folders with other users. +3. The service should support large files. +4. The service should allow syncing of files between devices. Updating a file on a device should get synchronized on all other devices. +5. ACID properties should be enforced on all file operations. +6. The service should support offline editing, and as soon as users come online, all changes should be synced. +7. The service should support snapshotting of data, so that users can go back to previous versions of it. + +#### Design Considerations +- We should expect large read and write volumes, with the read/write ratio being about the same. +- Files must be stored in small chunks. This has a lot of benefits; for example, if a user fails to upload a file, then only the failing chunk will be retried instead of the entire file. +- We can reduce data exchange by transferring updated chunks only. And, for small changes, clients can intelligently upload the diffs instead of the whole chunk. +- The system can remove duplicate chunks, to save storage space and bandwidth. +- We can prevent lots of round trips to the server if clients keep a local copy of metadata (file name, size, etc.). + +## 2. Capacity Estimation +``` +* Assume: 100 million users, 20 million DAU (daily active users) +* Assume: Each user has on average two devices +* Assume: Each user has on average about 100 files/photos/videos, we have + +Total files => 100,000,000 users * 100 files = 10 billion files + +* Assume: The average file size => 100KB, Total storage would be: + 0.1MB * 10B files => 1 PB (Petabyte) +``` + + + +## 3. High Level Design + +The user will have a folder as their workspace on their device. Any file/photo/folder placed inside this folder will be uploaded to the cloud. If changes are made to it, it will be reflected in the same way in the cloud storage. +- We need to store files and metadata (file name, size, dir, etc.) and who the file is shared with. +- We need block servers to help the clients upload/download files to our service. +- We need metadata servers to facilitate updating file and user metadata. +- We need a synchronization server to notify clients whenever an update happens so they can start synchronizing the updated files. +- We need to keep metadata of files updated in a NoSQL database. + +![](images/designing_cloud_high_level.png) + +## 4. Component Design +The major components of the system are as follows: + +### a. Client +The client monitors the workspace folder on the user's device and syncs changes to the remote cloud storage. +The main operations for the client are: +1. Upload and download files. +2. Detect file changes in the workspace folder. +3. Resolve conflicts due to offline or concurrent updates. + +#### Handling efficient file transfer +We can break files into small chunks, say 4MB. We can transfer only modified chunks instead of entire files. To get an optimal chunk size, we can consider the following: +- Input/output operations per sec (IOPS) for the storage devices in our backend. +- Network bandwidth. +- Average file size in our storage.
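As a rough sketch of the chunking step (the 4MB figure comes from the paragraph above; the function itself is an assumption, not part of the reference design), a client-side chunker only needs to read a file in fixed-size pieces:

```python
CHUNK_SIZE = 4 * 1024 * 1024  # 4MB, per the discussion above


def split_into_chunks(path, chunk_size=CHUNK_SIZE):
    """Yield fixed-size chunks of a file; only the chunks that changed need to be re-uploaded."""
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk
```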
+ +We should also keep a record of each file and the chunks that make up that file in our metadata servers. + +A copy of the metadata can also be kept with the client to enable offline updates and save round trips to update the remote metadata. + +#### Syncing with other clients +We can use HTTP long polling to request info from the server. If the server has no new data for this client, instead of sending an empty response, it holds the request open and waits for response information to become available. Once new info is available, the server immediately sends an HTTP response to the client, completing the open request. + +#### Major parts of the Client +![](images/designing_cloud_client.png) + +1. **Internal Metadata DB:** to keep track of all files, chunks, versions, and locations in the file system. +2. **Chunker:** will split files into chunks, and reconstruct a file from its chunks. +3. **Watcher:** will monitor the workspace folder and notify the indexer of user actions (e.g. CRUD operations), as well as listen for incoming sync changes broadcast by the `Sync Service`. +4. **Indexer:** will process events from the watcher and update the client DB with the necessary chunk/update information on files. Once chunks are synced to the cloud, the indexer can communicate with the `remote Sync Service` to broadcast changes to other clients and update the `Remote Metadata DB`. + +On client communication frequency: +> The client should exponentially back off connection retries to a busy/slow server, and mobile clients should sync on demand to save the user's bandwidth/data bundle and space. + + +### b. Metadata DB +The Metadata database can be a relational database like MySQL or a NoSQL DB like DynamoDB. +The Sync Service should be able to provide a consistent view of the files through the DB, especially if a file is being edited by more than one user. + +If we go with NoSQL for its scalability and performance, we can support ACID properties programmatically in the logic of our Sync Service. + +The objects to be saved in the Metadata NoSQL DB are as follows: +- Chunks +- Files +- Users +- Devices +- Workspace Sync Folders + + +### c. Sync Service +This component will process file updates made by a client and apply the changes to other subscribed clients. +It will sync the client's local DB with the info stored in the remote Metadata DB. + +**Consistency and reliability:** When the Sync Service receives an update request, it runs a verification process. This process first checks the Metadata DB for consistency before proceeding with the update, ensuring data integrity. This step helps prevent conflicts and inconsistencies that could arise from concurrent updates from multiple clients. + +**Efficient Data Transfer:** By transmitting only the diffs between file versions instead of the entire file, bandwidth consumption and cloud data storage usage are minimized. This approach is especially beneficial for large files and frequent-update scenarios. + +**Optimized storage:** The server and clients can calculate a hash using a collision-resistant algorithm (SHA hashes, checksums or even Merkle trees) to see whether to update a copy of a chunk or not. On the server, if we already have a chunk with the same hash, we don't need to create another copy; we can use the existing chunk.
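A minimal sketch of that hash check, assuming SHA-256 and two hypothetical callables, `remote_has_chunk` and `upload_chunk`, that stand in for the metadata lookup and the block-server upload (neither is defined elsewhere in this design):

```python
import hashlib


def sync_chunk(chunk, remote_has_chunk, upload_chunk):
    """Upload a chunk only if the server doesn't already have one with the same hash."""
    digest = hashlib.sha256(chunk).hexdigest()
    if not remote_has_chunk(digest):   # hypothetical metadata lookup
        upload_chunk(digest, chunk)    # hypothetical block-server upload
    return digest  # stored in the file's chunk list either way
```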
The sync service will intelligently identify and reuse existing chunks, reducing redundancy and conserving storage space. + +**Scalability through messaging middleware:** Adding messaging middleware between clients and the Sync Service will allow us to provide scalable message queuing and change notifications, supporting a high number of clients using pull or push strategies. +Multiple Sync Service instances can receive requests from a global request queue, and the messaging middleware will be able to balance the load across them. + +**Future-Proofing for Growth:** By designing the system with scalability and efficiency in mind, it can accommodate increasing demands as the user base grows or usage patterns evolve. This proactive approach minimizes the need for major architectural changes or performance optimizations down the line, leading to a more sustainable and adaptable system architecture. + +### d. Message Queuing Service +This component supports asynchronous communication between the client and the Sync Service, and efficiently stores any number of messages in a highly available, reliable and scalable queue. + +The service will have two queues: +1. **A Request Queue:** a global queue that will receive clients' requests to update the Metadata DB. +From there, the Sync Service will take the message and update the metadata. + +2. **A Response Queue:** will correspond to an individual subscribed client, and will deliver update messages to that client. Each message will be deleted from the queue once received by a client. Therefore, we need to create a separate Response Queue for each subscribed client. + +![](images/designing_cloud_mqs.png) + +## 5. File Processing Workflow +When Client A updates a file that's shared with Clients B and C, they should receive the update too. If the other clients +are not online at the time of the update, the Message Queue Service keeps the update notifications in their response queues until they come online. + +1. Client A uploads chunks to cloud storage. +2. Client A updates metadata and commits changes. +3. Client A gets confirmation, and notifications are sent to Clients B and C about the changes. +4. Clients B and C receive the metadata changes and download the updated chunks. + +## 6. Data Deduplication +We'll use this technique to remove duplicated copies of data to cut down storage costs. +For each incoming chunk, we calculate a hash of it and compare it with the hashes of existing chunks to see if an identical chunk is already saved. + +Two ways to do this: + +a. **In-line deduplication:** do hash calculations in real time as clients enter the data on the device. If an existing chunk has the same hash as a new chunk, we store a reference to the existing chunk as metadata. This prevents us from making a full copy of the chunk, saving on network bandwidth and storage usage. + +b. **Post-process deduplication:** store new chunks first, and later some process analyzes the data looking for duplicated chunks. The benefit here is that clients don't need to wait for hash lookups to complete before storing data. This ensures there's no degradation in storage performance. The drawback is that duplicate data will consume bandwidth, and we will also unnecessarily store it, but only for a short time. + +## 7. Partitioning Metadata DB +To scale our metadata DB, we can partition it using various partitioning schemes: + +We can use Range-based partitioning, where we store files/chunks in separate partitions based on the first letter of the file path.
But, later this might lead to unbalanced servers, where partitions that start with frequently occuring letters will have more files than those that dont. + +For Hash-based partitioning, we can take a hash of the object and use it to determine the DB partition to save the object. A hash on the `FileID` of the File object we are storing can be used to determine the partition to store the object. + +The hashing function will distribute objects into different partitions by mapping them to a number between `[1,...,256]` and this number would be the partition we store the object. And to prevent overloading some partitions, we can use `Constitent Hashing`. + +## 8. Load Balancer +We can add the load balancer at two places: +1. Between Client and Block servers +2. Between Client and Metadata servers + +![](images/designing_cloud_detailed.png) + +We can have a round robin load balancer that distributes incoming requests equally among backend servers. But if a server is overloaded or slow, the LB will not stop sending new requests to that server. To handle this, a more intelligent LB strategy can be implemented such that it queries for a backend server load before it sends traffic to that server, and adjusts traffic to a server based on its current server load. + +## 9. Caching +To deal with hot frequently used files/chunks, we can create a cache for block storage. We'll store whole chunks +and the system can cechk if the cache has the desired chunk before hitting Block storage. + +LRU eviction policy can be used to discard the least recently used chunk first. +We can also introduce a cache for metadata DB for hot metadata records. diff --git a/designing_api_rate_limiter.md b/designing_api_rate_limiter.md index 88ae138..e1938b5 100644 --- a/designing_api_rate_limiter.md +++ b/designing_api_rate_limiter.md @@ -1,280 +1,275 @@ - # Designing an API Rate Limiter - -An API reate limiter will throttle users based on the number of requests they are sending. - -## Why Rate Limiting? -Rate limiting is all about traffic/load management. Clients can misbehave and trigger a retry storm, others might request the same information over and over again, increasing traffic and unnecessary load on the system. This can degrade the performance of the service, and the ability of the system to reliably handle incoming requests. - -Rate limiting helps to protect services against abusive behaviors targeting the application such as retry storms, Denial-of-service (DOS) attacks, brute-force password attempts, brute-force credit card transactions, etc. - -Rate limiting saves the company money by eliminating service and infrastructure costs that would have otherwise been used to handle spamming. - -Here are some scenarios to show the importance of Rate limiting our API/Services: - -- **Prevents service degradation:** It reduces traffic spikes so that the service stays reliable for all users. - -- **Misbehaving clients:** Sometimes, clients can overwhelm servers by sending large number of requests, either intentionally or unintentionally. - -- **Security:** Limiting the number of times a user should authenticate with a wrong password. - -- **Preventing abusive and bad design practices:** Without API limits, developers of client apps might request the same info over and over again. - -- **Revenue:** Services can limit operations based on the tier of their customer's service and thus create a revenue model off the rate limiting. To go beyond the set limit, the user has to buy higher limits. - - -## 1. 
Requirements and System Goals - -#### Functional requirements -1. Limit the number of requests an entity can send to an API within a time window. -2. The user should get an error whenever they cross the defined threshold within a single server or across a set of servers. - -#### Non-Functional requirements -1. The system should be highly available, always protecting our API service from external attacks. -2. The rate limiter should NOT introduce substantial latencies affecting the user experience. - -## 2. Throttling Types - -* ***Hard Throttling*** – Number of API requests cannot exceed the throttle limit. - -* ***Soft Throttling*** – Set the API request limit to exceed by some percentage. E.g, if the rate-limit = 100 messages/minute, and 10% exceed-limit, our rate limiter will allow up to 110 messages per minute. - -* ***Dynamic Throttling (Priority throttling)*** – The number of requests can exceed the limit if the system has some free resources available. The system can progressively throttle requests based on some predefined priority. - -## 3. Algorithms used for Rate Limiting - -#### Fixed Window Algorithm -Here, the time window is considered from the start of the time-unit to the end of the time-unit. -For instance, a period will be considered 0-60 sec for a minute regardless of the time frame at which the API request has been made. - -The diagram below shows that we will only throttle 'm5' message, if our rate limit is 2 messages per second. - -![](images/fixed_rolling_window.svg) - -#### Rolling Window Algorithm -The time window is considered from the fraction of time at which the request is made plus the time window length. - -With a rate limit of 2 messages per second, the two messages sent at the 300th millisecond (m1) and 400th millisecond (m2), we'll count them as two messages starting from the 300th of that second to the 300th of the next second (making up one second). - -As a result, we will therefore throttle M3, M4 as shown above. - - - - -## 4. High Level Design - -Once a new request arrives, the Web server first asks the Rate Limiter to decide if it will be served or throttled. If the request is not throttled, then it's passed to the API servers. - -![](images/rate_limiter_high_level_design.png) - - -## 5. Basic System Design and Algorithm - -Assume for each user, our rate limiter allows 3 requests/sec. - -For each user, store: -- a request count (how many requests the user has made) -- a timestamp when we started counting - -We can keep this in a hashtable, where: - -```python -# Key (userID): Value {count, start_time} -hashtable = { - 'userId0': { - 'count': 3, 'start_time': 1574866492 - }, - 'userId1': { - 'count': 1, 'start_time': 1574873716 - }, - ... -} -``` - -When a new request comes in, the rate limiter will perform the following steps: - -1. If the `userID` does not exist in the hash-table, - - insert it, - - set `count` to 1 and set `start_time` to current epoch time -2. Otherwise, find the existing record of the userID, and - - if `current_time - start_time >= 1 minute`, reset `start_time` to be current time, - - set `count` to 1 and allow the request -3. If `current_time - start_time <= 1 minute` and - - If `count < 3`, increment the count and allow the request. - - If `count == 3`, reject the request. - -#### Problems with this Fixed Window Algorithm -1. We are resetting the `start_time` at the end of every minute, which means we can potentially allow twice the number of requests per minute. 
- -Imagine if a user sends 3 requests at the last second of a minute, they can immediately send 3 more requests at the very first second of the next minute, resulting in 6 requests in a span of two seconds. - -To fix this loophole, we'll use the sliding-window algorithm. - -![](images/fixed_window_problem.svg) - -2. Atomicity: The read and then write process can create a race condition. Imagine, a given user's current count = 2. If two seperate processes served each of these requests and concurrently read the count before either updated it, each process would erroneously think that the user had one more request to hit the rate limit. - -![](images/fixed_window_atomicity.svg) - - -#### Solutions -We can use a K-V store like Redis to store our key-value and solve the atomicity problem using [Redis lock](https://redis.io/topics/distlock) for the duration of the read-update operation. -This however, would slow down concurrent requests from the same user and introduce another layer of complexity. - -  - -#### How much memory to store all the user data? -Assume the userID takes 8 bytes, epoch time needs 4 bytes, 2 bytes for count: -``` -8 + 2 + 4 = 14 bytes -``` -Let's assume our hash-table has an overhead of 30 bytes for each record. If we need to track 10 million users at any time: -``` -Total memory = (14 + 30) bytes * 10 million users = 440,000,000 bytes => 440MB memory. -``` - -If we assume that we need a 4-byte number to lock each user's record to solve the atomicity problems -``` -Total memory = 4 bytes for lock + (14 + 30) bytes * 10 million users = 480,000,000 bytes => 480MB memory. -``` - -This can fit in a single server. However, we wouldn't want to route all traffic in a single machine because of availability issue => that one server goes down for any reason, our only instance of the rate limiter service goes down with it. - -For instance, when rate limiting 1M users at 10 req/sec, this would be about 10 million queries per second for our rate limiter. This would be too much to handle for a single server. Practically, we can use Redis or Memcached in a distributed setup. We'll store all the data in remote Redis servers and all the Rate limiter servers will read/update these servers before serving or throttling any request. - - - -## 6. Sliding Window Algorithm - -We can maintain a sliding window if we can keep track of each request per user. - -We will store the timestamp of each request in a [Redis Sorted Set](https://redis.io/docs/data-types/sorted-sets/). - -```python -hash_table = { - # userID: { Sorted Set } - 'userID-0': {1574860105, 1574881109, 1574890217 }, - 'userID-1': {1574866488, 1574866493, 1574866499} - ... -} -``` -Assume our rate limiter allows 3 requests/sec per user. - -When a new request comes in, the rate limiter will perform the following steps: - -1. Remove all timestamps from Sorted Set older than `1 second`. -2. Count elements in the set. Reject request if the count is greater than our throttling limit (3 for our case). -3. Insert current time in the sorted set and accept the request. - -#### Memory for storing user data? -Assume UserId takes 8 bytes, each epoch time takes 4 bytes. - -Now suppose we need a rate limiting of 500 requests per hour. Assume 20 bytes of overhead for hash-table and 20 bytes for sorted set. 
- -``` -(8 + 4 + 20 bytes sorted set overhead) * 500 + (20 bytes hash-table overhead) = 12KB -``` - -If we need to track 10 million users at any time: -``` -Total memory = 12KB * 10 million users = 11718.75 MB ≈ 120 GB -``` - -For 10M users, sliding window takes a lot of memory compared to fixed window; this won't scale well. We can combine the above two algorithms to optimize our memory usage. - - -## 7. Sliding Window + Counters -What if we keep track of request counts for each user using multiple fixed time windows. - -For example, if rate limit is hourly, keep a count for **each minute** and calculate the sum of all counter in the past hour when we receive a new request. - -This reduces our memory footprint. Consider a rate-limit at 500 requests/hour, with an additional limit of 10 requests/minute. *This means that when the sum of the counters with timestamps in the past hour `> 500`, the user has exceeded the rate limit.* - -In addition, the user can't send more than 10 requests per minute. This would be a reasonable and practical consideration, as none of the real users would send frequent requests. Even if they do, they'll see success with retries since their limits get reset every minute. - - -We can store our counter in a [Redis Hash](https://redis.io/docs/data-types/hashes/) because it's very efficient in storing <100 keys. -Which each request, increment a counter in the hash, it also sets the hash to [expire](https://redis.io/commands/ttl) an hour later. We will normalize each time to a minute. - - -``` -Rate limiting allowing 3 requests per minute for User1 - -[Allow request] 7:00:00 AM ---- "User1": {1574860100: 1} -[Allow request] 7:01:05 AM ---- "User1": { 1574860100: 1, 1574860160: 1} -[Allow request] 7:01:20 AM ---- "User1": { 1574860100: 1, 1574860160: 2} -[Allow request] 7:01:20 AM ---- "User1": { 1574860100: 1, 1574860160: 3} -[Reject request] 7:01:45 AM ---- "User1": { 1574860100: 1, 1574860160: 3} -[Allow request] 7:02:20 AM ---- "User1": { 1574860160: 3, 1574860220: 1} -``` - -#### How much memory to store all user data? - -We'll need: -``` - -UserID = 8 bytes -Counter = 2 bytes -Epoch time = 4 bytes - -Since we keep a count per minute, at max, we need 60 entries per user -8 + (4 + 2 + 20 Redis-hash overhead) * 60 entries + 20 Hash-table overhead = 1.6KB -``` - -If we need to track 10 million users at any time: -``` - -Total memory = 1.6KB * 10 million => 16 GB -( 92% less memory than the simple sliding window algorithm ) - -``` - - -# 8. Data Sharding and Caching -We can shard by `UserID` to distribute user data across different partitions. - -For fault tolerance and replication we should use Consistent Hashing. Consistent hashing is a very useful strategy for distributed caching system and DHTs. It allows us to distribute data across a cluster in such a way that will minimize reorganization when nodes are added or removed (resizing). - -  - -## Caching -We can get huge benefits from caching recent active users. - -Our app servers can quickly check if the cache has the desired record before hitting backend servers. Our rate limiter can benefit from the **Write-back cache** by updating all counters and timestamps in cache only. The write to the permanent storage can be done at fixed intervals. This way we can ensure minimum latency added to the user's request by the rate limiter. The reads can always hit the cache first; which will be extremely useful once the user has hit the rate limit and the rate limiter will only be reading data without any updates. 
- -Least Recently Used (LRU) can be a reasonable eviction policy for the our cache. - - -Sample response: - - - -# 9. Throttling Response -We can return a 429 status code: Too Many Requests whenever the user exceeds the rate limit. - -| Header Name | Description | -| :------------------- | :---------------------------------------------------------------------------- | -| RateLimit-Exceeded | The specific limit that has been exceeded. | -| Retry-After | The number of seconds that the client should wait before retrying a request. | - - -``` -HTTP/1.1 429 Too Many Requests -Transfer-Encoding: chunked -Retry-After: 5 -request-id: ff4751b7-d289-01a0-a6dd-9b3541c077fe -RateLimit-Exceeded: 60 -Cache-Control: private -Content-Type: application/json;charset=utf-8 -``` -```json -{ - "error": { - "code": "ClientThrottled", - "message": "Client application is over its resource limit." - } -} - - -```python - -``` + # Designing an API Rate Limiter + +An API reate limiter will throttle users based on the number of requests they are sending. + +## Why Rate Limiting? +Rate limiting is all about traffic/load management. Clients can misbehave and trigger a retry storm, others might request the same information over and over again, increasing traffic and unnecessary load on the system. This can degrade the performance of the service, and the ability of the system to reliably handle incoming requests. + +Rate limiting helps to protect services against abusive behaviors targeting the application such as retry storms, Denial-of-service (DOS) attacks, brute-force password attempts, brute-force credit card transactions, etc. + +Rate limiting saves the company money by eliminating service and infrastructure costs that would have otherwise been used to handle spamming. + +Here are some scenarios to show the importance of Rate limiting our API/Services: + +- **Prevents service degradation:** It reduces traffic spikes so that the service stays reliable for all users. + +- **Misbehaving clients:** Sometimes, clients can overwhelm servers by sending large number of requests, either intentionally or unintentionally. + +- **Security:** Limiting the number of times a user should authenticate with a wrong password. + +- **Preventing abusive and bad design practices:** Without API limits, developers of client apps might request the same info over and over again. + +- **Revenue:** Services can limit operations based on the tier of their customer's service and thus create a revenue model off the rate limiting. To go beyond the set limit, the user has to buy higher limits. + + +## 1. Requirements and System Goals + +#### Functional requirements +1. Limit the number of requests an entity can send to an API within a time window. +2. The user should get an error whenever they cross the defined threshold within a single server or across a set of servers. + +#### Non-Functional requirements +1. The system should be highly available, always protecting our API service from external attacks. +2. The rate limiter should NOT introduce substantial latencies affecting the user experience. + +## 2. Throttling Types + +- ***Hard Throttling*** – Number of API requests cannot exceed the throttle limit. + +- ***Soft Throttling*** – Set the API request limit to exceed by some percentage. E.g, if the rate-limit = 100 messages/minute, and 10% exceed-limit, our rate limiter will allow up to 110 messages per minute. 
+ +- ***Dynamic Throttling (Priority throttling)*** – The number of requests can exceed the limit if the system has some free resources available. The system can progressively throttle requests based on some predefined priority. + +## 3. Algorithms used for Rate Limiting + +#### Fixed Window Algorithm +Here, the time window is considered from the start of the time-unit to the end of the time-unit. +For instance, a period will be considered 0-60 sec for a minute regardless of the time frame at which the API request has been made. + +The diagram below shows that we will only throttle 'm5' message, if our rate limit is 2 messages per second. + +![](images/fixed_rolling_window.svg) + +#### Rolling Window Algorithm +The time window is considered from the fraction of time at which the request is made plus the time window length. + +With a rate limit of 2 messages per second, the two messages sent at the 300th millisecond (m1) and 400th millisecond (m2), we'll count them as two messages starting from the 300th of that second to the 300th of the next second (making up one second). + +As a result, we will therefore throttle M3, M4 as shown above. + + + + +## 4. High Level Design + +Once a new request arrives, the Web server first asks the Rate Limiter to decide if it will be served or throttled. If the request is not throttled, then it's passed to the API servers. + +![](images/rate_limiter_high_level_design.png) + + +## 5. Basic System Design and Algorithm + +Assume for each user, our rate limiter allows 3 requests/sec. + +For each user, store: +- a request count (how many requests the user has made) +- a timestamp when we started counting + +We can keep this in a hashtable, where: + +```python +# Key (userID): Value {count, start_time} +hashtable = { + 'userId0': { + 'count': 3, 'start_time': 1574866492 + }, + 'userId1': { + 'count': 1, 'start_time': 1574873716 + }, + ... +} +``` + +When a new request comes in, the rate limiter will perform the following steps: + +1. If the `userID` does not exist in the hash-table, + - insert it, + - set `count` to 1 and set `start_time` to current epoch time +2. Otherwise, find the existing record of the userID, and + - if `current_time - start_time >= 1 minute`, reset `start_time` to be current time, + - set `count` to 1 and allow the request +3. If `current_time - start_time <= 1 minute` and + - If `count < 3`, increment the count and allow the request. + - If `count == 3`, reject the request. + +#### Problems with this Fixed Window Algorithm +1. We are resetting the `start_time` at the end of every minute, which means we can potentially allow twice the number of requests per minute. + +Imagine if a user sends 3 requests at the last second of a minute, they can immediately send 3 more requests at the very first second of the next minute, resulting in 6 requests in a span of two seconds. + +To fix this loophole, we'll use the sliding-window algorithm. + +![](images/fixed_window_problem.svg) + +2. Atomicity: The read and then write process can create a race condition. Imagine, a given user's current count = 2. If two seperate processes served each of these requests and concurrently read the count before either updated it, each process would erroneously think that the user had one more request to hit the rate limit. 
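For illustration, here is a minimal single-process sketch of the fixed-window check described in section 5; the class name and the 3-requests-per-60-seconds limit are assumptions. Because it reads the counter and then writes it back in separate steps, it is exposed to exactly the race condition described above once the store is shared between processes:

```python
import time


class FixedWindowRateLimiter:
    """Naive in-memory fixed-window limiter: `limit` requests per `window` seconds per user."""

    def __init__(self, limit=3, window=60):
        self.limit = limit
        self.window = window
        self.records = {}  # userID -> {"count": int, "start_time": float}

    def allow_request(self, user_id):
        now = time.time()
        record = self.records.get(user_id)
        if record is None or now - record["start_time"] >= self.window:
            # New user, or the window has elapsed: reset and allow.
            self.records[user_id] = {"count": 1, "start_time": now}
            return True
        if record["count"] < self.limit:
            record["count"] += 1  # read-then-write: not atomic across processes
            return True
        return False


limiter = FixedWindowRateLimiter(limit=3, window=60)
print([limiter.allow_request("userId0") for _ in range(4)])  # [True, True, True, False]
```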
+ +![](images/fixed_window_atomicity.svg) + + +#### Solutions +We can use a K-V store like Redis to store our key-value and solve the atomicity problem using [Redis lock](https://redis.io/topics/distlock) for the duration of the read-update operation. +This however, would slow down concurrent requests from the same user and introduce another layer of complexity. + +  + +#### How much memory to store all the user data? +Assume the userID takes 8 bytes, epoch time needs 4 bytes, 2 bytes for count: +``` +8 + 2 + 4 = 14 bytes +``` +Let's assume our hash-table has an overhead of 30 bytes for each record. If we need to track 10 million users at any time: +``` +Total memory = (14 + 30) bytes * 10 million users = 440,000,000 bytes => 440MB memory. +``` + +If we assume that we need a 4-byte number to lock each user's record to solve the atomicity problems +``` +Total memory = 4 bytes for lock + (14 + 30) bytes * 10 million users = 480,000,000 bytes => 480MB memory. +``` + +This can fit in a single server. However, we wouldn't want to route all traffic in a single machine because of availability issue => that one server goes down for any reason, our only instance of the rate limiter service goes down with it. + +For instance, when rate limiting 1M users at 10 req/sec, this would be about 10 million queries per second for our rate limiter. This would be too much to handle for a single server. Practically, we can use Redis or Memcached in a distributed setup. We'll store all the data in remote Redis servers and all the Rate limiter servers will read/update these servers before serving or throttling any request. + + + +## 6. Sliding Window Algorithm + +We can maintain a sliding window if we can keep track of each request per user. + +We will store the timestamp of each request in a [Redis Sorted Set](https://redis.io/docs/data-types/sorted-sets/). + +```python +hash_table = { + # userID: { Sorted Set } + 'userID-0': {1574860105, 1574881109, 1574890217 }, + 'userID-1': {1574866488, 1574866493, 1574866499} + ... +} +``` +Assume our rate limiter allows 3 requests/sec per user. + +When a new request comes in, the rate limiter will perform the following steps: + +1. Remove all timestamps from Sorted Set older than `1 second`. +2. Count elements in the set. Reject request if the count is greater than our throttling limit (3 for our case). +3. Insert current time in the sorted set and accept the request. + +#### Memory for storing user data? +Assume UserId takes 8 bytes, each epoch time takes 4 bytes. + +Now suppose we need a rate limiting of 500 requests per hour. Assume 20 bytes of overhead for hash-table and 20 bytes for sorted set. + +``` +(8 + 4 + 20 bytes sorted set overhead) * 500 + (20 bytes hash-table overhead) = 12KB +``` + +If we need to track 10 million users at any time: +``` +Total memory = 12KB * 10 million users = 11718.75 MB ≈ 120 GB +``` + +For 10M users, sliding window takes a lot of memory compared to fixed window; this won't scale well. We can combine the above two algorithms to optimize our memory usage. + + +## 7. Sliding Window + Counters +What if we keep track of request counts for each user using multiple fixed time windows. + +For example, if rate limit is hourly, keep a count for **each minute** and calculate the sum of all counter in the past hour when we receive a new request. + +This reduces our memory footprint. Consider a rate-limit at 500 requests/hour, with an additional limit of 10 requests/minute. 
*This means that when the sum of the counters with timestamps in the past hour is `> 500`, the user has exceeded the rate limit.*

In addition, the user can't send more than 10 requests per minute. This is a reasonable and practical constraint, since real users are unlikely to send requests that frequently. Even if they do, retries will succeed, because the per-minute limit resets every minute.


We can store our counters in a [Redis Hash](https://redis.io/docs/data-types/hashes/), because it is very efficient at storing fewer than 100 keys.
With each request, we increment a counter in the hash and set the hash to [expire](https://redis.io/commands/ttl) an hour later. We normalize each timestamp to the start of its minute.


```
Rate limiting allowing 3 requests per minute for User1

[Allow request] 7:00:00 AM ---- "User1": {1574860100: 1}
[Allow request] 7:01:05 AM ---- "User1": { 1574860100: 1, 1574860160: 1}
[Allow request] 7:01:20 AM ---- "User1": { 1574860100: 1, 1574860160: 2}
[Allow request] 7:01:20 AM ---- "User1": { 1574860100: 1, 1574860160: 3}
[Reject request] 7:01:45 AM ---- "User1": { 1574860100: 1, 1574860160: 3}
[Allow request] 7:02:20 AM ---- "User1": { 1574860160: 3, 1574860220: 1}
```

#### How much memory to store all user data?

We'll need:
```

UserID = 8 bytes
Counter = 2 bytes
Epoch time = 4 bytes

Since we keep a count per minute, at max we need 60 entries per user:
8 + (4 + 2 + 20 Redis-hash overhead) * 60 entries + 20 hash-table overhead ~= 1.6KB
```

If we need to track 10 million users at any time:
```

Total memory = 1.6KB * 10 million users => 16 GB
( roughly 87% less memory than the simple sliding window algorithm )

```


## 8. Data Sharding and Caching
We can shard by `UserID` to distribute user data across different partitions.

For fault tolerance and replication we should use Consistent Hashing. Consistent hashing is a very useful strategy for distributed caching systems and DHTs. It allows us to distribute data across a cluster in a way that minimizes reorganization when nodes are added or removed (resizing).

&nbsp;

### Caching
We can get huge benefits from caching recently active users.

Our app servers can quickly check whether the cache has the desired record before hitting the backend servers. Our rate limiter can benefit from a **write-back cache**: update all counters and timestamps in the cache only, and write to permanent storage at fixed intervals. This keeps the latency the rate limiter adds to a user's request to a minimum. Reads can always hit the cache first, which is especially useful once a user has hit their rate limit, since the rate limiter will then only be reading data without any updates.

Least Recently Used (LRU) can be a reasonable eviction policy for our cache.


## 9. Throttling Response
We can return HTTP status code 429 (Too Many Requests) whenever a user exceeds the rate limit, along with headers like these:

| Header Name          | Description                                                                   |
| :------------------- | :---------------------------------------------------------------------------- |
| RateLimit-Exceeded   | The specific limit that has been exceeded.                                     |
| Retry-After          | The number of seconds that the client should wait before retrying a request.
| + + +``` +HTTP/1.1 429 Too Many Requests +Transfer-Encoding: chunked +Retry-After: 5 +request-id: ff4751b7-d289-01a0-a6dd-9b3541c077fe +RateLimit-Exceeded: 60 +Cache-Control: private +Content-Type: application/json;charset=utf-8 +``` +```json +{ + "error": { + "code": "ClientThrottled", + "message": "Client application is over its resource limit." + } +} diff --git a/designing_instagram.md b/designing_instagram.md index 8393e06..d7ff7d9 100644 --- a/designing_instagram.md +++ b/designing_instagram.md @@ -9,15 +9,15 @@ We plan to design a simpler version of Instagram, where a user can share photos ## 1. Requirements and Goals of the System #### Functional requirements -1. Users should be able to upload/download/view photos -2. Users can perform searches baed on photo/video titles -3. Users can follow other users -4. The system should generate Newsfeed consisting top photos from all the people the user follows +- Users should be able to upload/download/view photos +- Users can perform searches baed on photo/video titles +- Users can follow other users +- The system should generate Newsfeed consisting top photos from all the people the user follows #### Non-functional requirements -1. The service needs to be highly available -2. The acceptable latency is 200ms for News Feed generation -3. The system should be highly reliable; any uploaded photo/video should never be lost. +- The service needs to be highly available +- The acceptable latency is 200ms for News Feed generation +- The system should be highly reliable; any uploaded photo/video should never be lost. ## 2. Capacity Estimation and Constraints @@ -219,7 +219,7 @@ To improve the efficiency, we can pre-generate the News Feed and store it in a s **1. Pull**: Clients pull content from server on a regular/ or manually. Problems: - New data not showing until client issues a pull request -- Most of the time pull requessts will result in empty response if there's no data. (Frustrating the user) +- Most of the time pull requests will result in empty response if there's no data. (Frustrating the user) **2. Push:** Servers push new data to users as soon as it is available. Users maintain a long poll request with the server. A possible problem is, a user who follows a lot of people or a celebrity user who has millions of followers; the server will have to push lots of updates quite frequently, straining the server. @@ -229,7 +229,7 @@ Problems: -# 10. Cache and Load balancing +## 10. Cache and Load balancing Our service will need a massive-scale photo delivery system to serve the globally distributed users. @@ -241,9 +241,4 @@ For cache eviction, we can use Least Recently User (LRU), where we discard the l #### **How can we build a more intelligent cache?** -If we go with 80-20 rule, 20% of photo reads generates 80% of traffic. This means that certain photos are so popular that the majority of people view/search them. Therefore, we can try caching 20% of daily read volume of photos and metadata. - - -```python - -``` +If we go with 80-20 rule, 20% of photo reads generates 80% of traffic. This means that certain photos are so popular that the majority of people view/search them. Therefore, we can try caching 20% of daily read volume of photos and metadata. 
diff --git a/designing_instagram_feed.md b/designing_instagram_feed.md new file mode 100644 index 0000000..2a7ca7d --- /dev/null +++ b/designing_instagram_feed.md @@ -0,0 +1,306 @@ +# Designing Instagram Feed + +A feed is a constantly updating scrollable list of posts, photos, videos, and status updates from all the people and pages a user follows. + + +## 1. Requirements and System Goals + +### Functional requirements +- Feeds may contain images, videos and text. +- Feeds are generated from the posts belonging to the pages and people the user follows. +- The service should support appending new posts as they arrive to the feed for all active users + +### Non-functional requirements +- The system should be able to generate a user's newsfeed in real-time - maximum latency seen by the end user should be about 2s. + +## 2. Capacity Estimation and Constraints +Assume on average a user has 300 friends and follows 200 pages. + +**Traffic Estimates:** A typical user checks their feed about 5 times per day on average. If we have 200 million daily active users, then: + +```text +Per day: 200M * 5 => 1B requests + +Per second: 1B / 86400 sec => 11500 reqs/sec +``` + +**Storage estimates:** Assume we have on average 500 posts for each user's feed that we want to keep in memory for fast fetching. +For simplicity, let's also assume an average photo file size posted would be about 200KB. This means we need about 200KB X 500 = 10 MB per user. +To store all this data for all active users, we'll need: + +```text + 200 M active users * 10MB ~= 2 Petabyte of memory +``` + +If a server can hold 100 GB memory, we'd need about 20000 machines to keep the top 500 posts in memory. + + +## 3. System APIs +> By defining the system APIs, you are explicitly stating what is expected from the system + +We'll have REST APIs to expose our service's functionality. + +**Getting User Feed** + +```python +getUserFeed( + api_dev_key: int, # Key of a registered user, used to throttle users based on their allocated quota. + user_id: int, # The Id of the user whom the system will generate the feed. + since_id: int, # (Optional) Return results with IDs more recent than this ID. + count: int , # (Optional) Specifies number of feed items to try and retrieve. + max_id: int, # (Optional) Returns results with IDs younger than the specified ID. + exclude_replies # (Optional) Prevents replies from appearing in the results. +``` + +**Returns:** (JSON) object containing a list of feed items. + +## 4. Database Design +There are three major objects: User, Entity (Business Accounts, Brands, Pages etc) and Post (A feed item). + * A user follows entities and other users. + * Users and entities can both post a Post which can contain text, images, or videos + * Each Post has a UserID of the user who created it. + * For simplicity, let's assume only users can create a post. + * A Post can optionally have an EntityID that points to the page or business entity where the post was created. + +If we use a relational DB, we can model two relations: User-to-Entity and Post-to-Media relation. Since each user can be friends with many people and follow a lot of entities, we can store this relation in a separate table. 
+ +| Users | | +|:----:|:-----------------------| +|PK | **UserID: int** | +| | Name: varchar(32) | +| | Email: varchar(32) | +| | DOB: datetime | +| | CreatedAt: datetime | +| | LastLogin: datetime | + +  + + | Entity | | +|:----:|:-----------------------| +|PK | **EntityID: int** | +| | Name: varchar(32) | +| | Type: int | +| | Email: varchar(32) | +| | Description: varchar(512) | +| | Phone: varchar(12) | +| | CreatedAt: datetime | +| | LastLogin: datetime | + + +  + +| UserFollow | | +|:----:|:-----------------------| +|PK | (**UserID, EntityOrFriendID: int**) | +| | Type: enum (0, 1) | + +The **Type** column above identifies if an entity being followed is a user or an entity. +We can also have a table for the Media to Post relation. + +  + +| Post | | +|:----:|:-----------------------| +|PK | **PostID: int** | +| | UserID: int | +| | Contents: varchar(256) | +| | EntityID: int | +| | Latitute: int | +| | Longitude: int | +| | CreatedAt: datetime | +| | Likes: int | + +  + +| Media | | +|:----:|:-----------------------| +| | MediaID: int | +| | Type: enum | +| | Description: varchar(256) | +| | Path: int | +| | Latitute: int | +| | Longitude: int | +| | CreatedAt: datetime | + +  + +| PostMedia | | +|:----:|:-----------------------| +|PK | (**PostID, MediaID: int**) | + + + +## 5. High Level System Design +At a high level we have two system parts: +Feed generation and Feed publishing. + +#### Feed Generation +Whenever our system receives a request to generate a feed for a user, we'll perform these steps: + 1. Get all UserIDs and EntityIDs that the user follows + 2. Retrieve latest and most popular posts for those IDs + 3. Rank them based on relevance to the user. This is the user's current feed. + 4. Store the feed in a cache. + 5. Return top posts to be rendered on the user's feed. + 6. On front-end, when the user reaches the end of the loaded feed, fetch the next posts from the cache server. + 7. Periodically rank and add new posts to the user's feed. + 8. Notify user that there are new posts + +#### Feed Publishing +When the user loads her feed, she has request and pull posts from the server. When she reaches the end of her current feed, the server can push new posts. + +Should the server notify the user then the user can pull, or should the server just push new posts? + +At a high level, we'll have the following components: + +1. Web Servers: maintain the connection to the user to allow data transfer between user client and server. +2. Application Server: executes the work of storing new posts in the DB servers, as well as retrieval from the DB and pushing the feed to the user. +3. Metadata DB and Cache: store metadata about Users, Pages, Businesses, etc +4. Post DB and Cache: to store metadata about posts and their contents +5. Video/Photo storage and Cache: Blob storage to store all media in the posts +6. Feed generation service: to get and rank relevant posts for a user and generate the feed, and store in the cache. +7. Feed notification service: to notify user that there are newer feed posts + +## 6. Detailed Component Design + +Let's look at generating the feed. The query would look something like this: + +~~~mysql +SELECT PostID FROM Post WHERE UserID IN + (SELECT EntityOrFriendID FROM UserFollow WHERE UserID = AND type = 0) -- user + UNION +SELECT PostID from Post WHERE EntityID IN + (SELECT EntityID FROM UserFollow WHERE UserID = AND type = 1) -- entity +ORDER BY CreatedAt DESC +LIMIT 100 +~~~ +We want to avoid a direct query to the DB due to high latency. 
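For reference, here is how that query might be issued from application code with explicit placeholders, since the user-id values were elided in the SQL above. The `%s` placeholder style, the `conn` object, and including `CreatedAt` in the select list (so the `ORDER BY` over the union is valid) are assumptions of this sketch.

```python
FEED_QUERY = """
SELECT PostID, CreatedAt FROM Post WHERE UserID IN
    (SELECT EntityOrFriendID FROM UserFollow WHERE UserID = %s AND Type = 0)  -- followed users
UNION
SELECT PostID, CreatedAt FROM Post WHERE EntityID IN
    (SELECT EntityOrFriendID FROM UserFollow WHERE UserID = %s AND Type = 1)  -- followed entities
ORDER BY CreatedAt DESC
LIMIT 100
"""

def fetch_feed_post_ids(conn, user_id):
    """Run the feed query for one user on a DB-API style connection."""
    cursor = conn.cursor()
    cursor.execute(FEED_QUERY, (user_id, user_id))
    rows = cursor.fetchall()
    cursor.close()
    return [row[0] for row in rows]
```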
+
We also want to avoid generating the feed when a user loads the page, because that would be slow and add high latency.

Also, having the server notify a user who follows a lot of people about every new post could lead to heavy load. To improve on all this, we can pre-generate the feed and store it in memory.

### Offline generation
We can have servers dedicated to continuously generating users' feeds and storing them in memory. When a user requests new posts, we simply serve them from the pre-generated store. A user's feed is thus compiled not on load but on a regular basis, and returned whenever the user requests it.


When the servers need to generate the feed for a user, they first check when the feed was last generated. The new feed is generated from that time onwards.

We can store this data in a hash table where key = UserID and value is:
```c
Struct {
    LinkedHashMap posts;
    DateTime lastGenerated;
}
```

We can store the PostIDs in a Linked HashMap (a hash table + doubly-linked list implementation), which allows jumping to any post in constant time while also iterating through the map easily. (The linked list maintains the order in which keys were inserted into the map.)

When fetching new posts, the client sends the last PostID the user currently sees in their feed; the server can then jump to that PostID in our hashmap and return the next batch of posts from there.

#### How many posts should we store in memory?
Initially we can store 500 posts per user, but this number can be adjusted based on each user's usage pattern. Users who never scroll past the first 10 posts can have only 100 posts kept in memory.

#### Should we generate feeds for all users?
No. Lots of users won't log in frequently.
We can use an LRU-based cache to remove users from memory who haven't accessed their feed for a long time.

We can also use machine learning to pre-generate a user's feed based on their login patterns.

### Feed Publishing
The process of pushing a post to all followers is called **fanout**.

Approaches to publishing:
1. **Pull model (Fanout on load):** Clients pull data either on intervals or manually when needed. The problems with this approach are:

    a. New data might not be shown to users until they do a pull request.

    b. Most of the time, pulls return empty responses: a wasted resource that could have been avoided.

2. **Push model (Fanout on write):** Immediately push a post to all followers once a user posts it. The advantage here is that you don't need to iterate through the friends list to build their feeds, significantly reducing read operations. Users have to maintain a long-poll request with the server to receive updates. A possible problem with this approach is that when a celebrity user has millions of followers, the server has to push updates to a lot of people.

3. **Hybrid:** We can combine the push and pull models. We stop pushing posts from celebrity users with lots of followers and let their followers pull those updates instead. By doing this, we save a huge amount of fanout resources. Alternatively, we can limit the push fanout to only followers who are online.

#### How many posts should we return to the client?
Say 20 per request. Different clients (mobile vs. desktop) may also fetch a different number of posts due to differences in screen size and bandwidth usage.

We can push notifications of new posts to users on desktop, where data usage is cheap. For mobile devices, data usage is expensive, so we can choose not to push data and instead let users *pull to refresh* to get new posts.

## 7. 
Feed Ranking +To rank posts in a newsfeed, we can use creation time of the posts. However today's +ranking algorithms are doing more to ensure important posts are ranked higher. + +The idea is to select key **features** that make a post important, +combining them and calculating the final ranking score. + +These features include: +* creation time +* number of likes +* number of comments +* number of shares, +* time of the updates + +We can also check the effectiveness of the ranking system by evaluating if it has increased user retention and add revenue etc. + + + +## 8. Data Partitioning +#### a. Sharding Posts and metadata +Since we have a huge number of new posts every day and our read load is extremely high, we need to distribute data across multiple machines for better read/write efficiency. + +#### Sharding based on UserID +We can try storing a user's data on one server. While storing: +- Pass a UserID to our hash function that will map the user to a DB server where we'll store all of their data. +- While querying for their data, we can ask the hash function where to find it and read it from there. + +Issues: +1. What if a user is IG famous? There will be lots of queries on the server holding that user. This high load will affect the service's performance. +2. Over time, some users will have more data compared to others. Maintaining a uniform distribution of growing data across servers is quite difficult. + +#### Sharding based on PostID +A hash function maps each PostID to a random server where we can store that post. + +A centralized server running the offline feed generation service will: +1. Find all the people the user follows. +2. Send a query to all DB partitions to find posts from these people. +3. Each DB server will find posts for each user, sort them by recency and return top posts to the centralized server. + +The service will merge all results and sort them again to be stored in cache; ready to be retrieved whenever a user does a pull request. This solves the problem of hot users. + +Issues with this approach: +- We have to query all DB partitions to find posts for a user, leading to higher latencies. + +> We can improve the performance by caching hot posts in front of the DB servers. + +#### Sharding based on Post creation time + +Storing posts based on creation timestamp will help us to fetch all top posts quickly and we only have to query a very small set of database servers. + +Issues: +- Traffic load won't be distributed. e.g when writing, all new posts will be going to one server, and remaining servers sit idle. When reading, server holding the latest data will have high load as compared to servers holding old data. + +#### Sharding by PostID + Post creation time +Each PostID should be universally unique and contain a timestamp. +We can use epoch time for this. + +![](images/twitter_epoch.png) + +First part of PostID will be a timestamp, second part an auto-incrementing number. We can then figure out the shard number from this PostId and store it there. + +For fault tolerance and better performance, we can have two DB servers to generate auto-incrementing keys; one odd numbers and one even numbered keys. 
+ +If we assume our current epoch seconds begins now, PostIDs will look like this: + + + +```python +epoch = 1571691220 +print('epoch - autoincrement') +for i in range(1,5): + print(f'{epoch}-{i:06}') +``` + + epoch - autoincrement + 1571691220-000001 + 1571691220-000002 + 1571691220-000003 + 1571691220-000004 diff --git a/designing_ticketmaster.md b/designing_ticketmaster.md index 119b762..8d88b7b 100644 --- a/designing_ticketmaster.md +++ b/designing_ticketmaster.md @@ -22,6 +22,7 @@ A movie ticket booking system provides its customer the ability to purchase thea - The system has financial transactions, meaning it should be secure and the DB should be ACID compliant. - Assume traffic will spike on popular/much-awaited movie releases and the seats would fill up pretty fast, so the service should be highly scalable and highly available to keep up with the surge in traffic. + ### Design Considerations 1. Assume that our service doesn't require authentication. 2. No handling of partial ticket orders. Either users get all the tickets they want or they get nothing. @@ -31,24 +32,27 @@ A movie ticket booking system provides its customer the ability to purchase thea ## 2. Capacity Estimation > **Traffic estimates:** 3 billion monthly page views, sells 10 million tickets a month. -> **Storage estimates:** 500 cities, on average each city has 10 cinemas, each with 300 seats, 3 shows daily. + +> **Storage estimates:** +500 cities, on average each city has 10 cinemas, each with 300 seats, 3 shows daily. Let's assume each seat booking needs 50 bytes (IDs, NumberOfSeats, ShowID, MovieID, SeatNumbers, SeatStatus, Timestamp, etc) to store in the DB. We need to store information about movies and cinemas; assume another 50 bytes. So to store all data about all shows of all cinemas of all cities for a day -```text +``` 500 cities * 10 cinemas * 300 seats * 3 shows * (50 + 50) bytes = 450 MB / day ``` To store data for 5 years, we'd need around -```text +``` 450 MB/day * 365 * 5 = 821.25 GB ``` ## 3. System APIs Let's use REST APIs to expose the functionality of our service. + ### Searching movies ```python search_movies( @@ -140,7 +144,7 @@ reserve_seats( ``` Returns: (JSON) -```text +``` The status of the reservation, which would be one of the following: 1. Reservation Successful, 2. Reservation Failed - Show Full @@ -156,6 +160,7 @@ Returns: (JSON)   + ![](images/e_ticketing_db_design.svg) ## 5. High Level Design @@ -178,11 +183,10 @@ Let's explore the workflow part where there are no seats available to reserve, b If seats are reserved successfully, the user has 5 minutes to pay for the reservation. After payment, booking is marked complete. If the user isn't able to pay within 5 minutes, all the reserved seats are freed from the reservation pool to become available to other users. -### How we'll keep track of all active reservations that have not been booked yet, and keep track of waiting customers +#### How do we keep track of all active reservations that haven't been booked yet? and also keep track of waiting customers? +We need two daemon services: -We need two daemon services for this: - -#### a. Active Reservation Service +**a. Active Reservation Service** This will keep track of all active reservations and remove expired ones from the system. 
@@ -193,14 +197,10 @@ We can keep all the reservations of a show in memory in a [Linked Hashmap](https To store every reservation for every show, we can have a HashTable where the `key` = `ShowID` and `value` = Linked HashMap containing `BookingID` and creation `Timestamp`. In the DB, -- We store reservation in the `Booking` table. - -- Expiry time will be in the Timestamp column. - -- The `Status` field will have a value of `Reserved(1)` and, as soon as a booking is complete, update the status to `Booked(2)`. - -- After status is changed, remove the reservation record from Linked HashMap of the relevant show. - +- we store reservation in the `Booking` table +- expiry time will be in the Timestamp column. +- The `Status` field will have a value of `Reserved(1)` and, as soon as a booking is complete, update the status to `Booked(2)` + - After status is changed, remove the reservation record from Linked HashMap of the relevant show. - When reservation expires, remove it from the Booking table or mark it `Expired(3)`, and remove it from memory as well. ActiveReservationService will work with the external Financial service to process user payments. When a booking is completed, or a reservation expires, WaitingUserService will get a signal so that any waiting customer can be served. @@ -219,9 +219,9 @@ hash_table = { } ``` -#### b. Waiting User Service +**b. Waiting User Service** -- This daemon service will keep track of waiting users in a Linked HashMap or TreeMap. +- This service will keep track of waiting users in a Linked HashMap or TreeMap. - To help us jump to any user in the list and remove them when they cancel the request. - Since it's a first-come-first-served basis, the head of the Linked HashMap would always point to the longest waiting user, so that whenever seats become available, we can serve users in a fair manner. @@ -235,21 +235,22 @@ On the server, the Active Reservation Service keeps track of expiry of active co On the client, we will show a timer (for expiration time), which could be a little out of sync with the server. We can add a buffer of 5 seconds on the server to prevent the client from ever timing out after the server, which, if left unchecked, could prevent successful purchase. -## 7. Concurrency + +# 7. Concurrency We need to handle concurrency, such that no 2 users are able to book the same seat. We can use transactions in SQL databases, isolating each transaction by locking the rows before we can update them. If we read rows, we'll get a write lock on the them so that they can't be updated by anyone else. Once the DB transaction is committed and successful, we can start tracking the reservation in the Active Reservation Service. -## 8. Fault Tolerance +# 8. Fault Tolerance If the two services crash, we can read all active reservations from the Booking table. Another option is to have a **master-slace configuration** so that, when the master crashes, the slave can take over. We are not storing the waiting users in the DB, so when Waiting User Service crashes, we don't have any means to recover the data unless we have a master-slave setup. We can also have the same master-slave setup for DBs to make them fault tolerant. -## 9. Data Partitioning +# 9. Data Partitioning Partitioning by MovieID will result in all Shows of a Movie being in a single server. For a hot movie, this could cause a lot of load on that server. A better approach would be to partition based on ShowID; this way, the load gets distributed among different servers. 
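As a quick illustration of ShowID-based partitioning, here is a hedged sketch; the partition count, helper name, and simple modulo scheme are assumptions, and a consistent-hashing ring could replace the modulo to ease resharding.

```python
NUM_PARTITIONS = 16  # assumed number of database partitions

def partition_for_show(show_id: int) -> int:
    """Map a ShowID to the partition holding its seats and bookings."""
    return show_id % NUM_PARTITIONS

# Different shows (even of the same hot movie) spread across partitions,
# while all bookings for one show stay together on a single partition.
print(partition_for_show(1001), partition_for_show(1002))  # -> 9 10
```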
diff --git a/designing_twitter.md b/designing_twitter.md index 23a72de..77437ba 100644 --- a/designing_twitter.md +++ b/designing_twitter.md @@ -1,4 +1,3 @@ - # Designing Twitter Twitter is an online social networking service where users post and read short 140-character messages called "tweets". Registered Users can post and read tweets, but those not registered can only read them. @@ -7,24 +6,24 @@ Twitter is an online social networking service where users post and read short 1 ## 1. Requirements and System Goals ### Functional Requirements -1. Users should be able to post new tweets. -2. A user should be able to follow other users. -3. Users should be able to mark tweets as favorites. -4. Tweets can contain photos and videos. -5. A user should have a timeline consting of top tweets from all the people the user follows. +- Users should be able to post new tweets. +- A user should be able to follow other users. +- Users should be able to mark tweets as favorites. +- Tweets can contain photos and videos. +- A user should have a timeline consting of top tweets from all the people the user follows. ### Non-functional Requirements -1. Our service needs to be highly available. -2. Acceptance latency of the sytstem is 200ms for timeline generation. -3. Consistency can take a hit (in the interest of availability); if user doesn't see a tweet for a while, it should be fine. +- Our service needs to be highly available. +- Acceptance latency of the sytstem is 200ms for timeline generation. +- Consistency can take a hit (in the interest of availability); if user doesn't see a tweet for a while, it should be fine. ### Extended Requirements -1. Searching tweets. -2. Replying to a tweet. -3. Trending topics - current hot topics. -4. Tagging other users. -5. Tweet notification. -6. Suggestions on who to follow. +- Searching tweets. +- Replying to a tweet. +- Trending topics - current hot topics. +- Tagging other users. +- Tweet notification. +- Suggestions on who to follow. ## Capacity Estimation and Constraints @@ -110,7 +109,7 @@ We need to store data about users, their tweets, their favorite tweets, and peop ![](images/twitter_db_schema.svg) -For choosing between SQL or NoSQL, check out [Designing Instagram](designing_instagram.md) +For choosing between SQL or NoSQL, check out Designing Instagram from the README. ## 6. Data Sharding @@ -123,8 +122,8 @@ We can try storing a user's data on one server. While storing: - While querying for their data, we can ask the hash function where to find it and read it from there. Issues: -1. What if a user becomes hot? There will be lots of queries on the server holding that user. This high load will affect the service's performance. -2. Over time, some users will have more data compared to others. Maintaining a uniform distribution of growing data is quite difficult. +- What if a user becomes hot? There will be lots of queries on the server holding that user. This high load will affect the service's performance. +- Over time, some users will have more data compared to others. Maintaining a uniform distribution of growing data is quite difficult. #### Sharding based on TweetID - Hash function maps each TweetID to a random server where we store that tweet. @@ -146,10 +145,10 @@ Issues with this approach: #### Sharding based on Tweet creation time -Storing tweets based on creation timestamp will give help us to fetch all top tweets quickly and we only have to query a very small set of servers. 
+Storing tweets based on creation timestamp will help us to fetch all top tweets quickly and we only have to query a very small set of database servers. Issues: -- Traffic load won't be distributed. e.g when writing, all new tweets will be going to one server, and remaining servers sit idle. When reading, server holding the latest data will have high load as compared to servers holding old data. +- Traffic load won't be distributed. e.g when writing, all new tweets will be going to one DB server, and remaining DB servers sit idle. When reading, a database server holding the latest data will have high load as compared to servers holding old data. #### Sharding by TweetID + Tweet creation time Each tweetID should be universally unique and contain a timestamp. @@ -158,7 +157,7 @@ We can use epoch time for this. First part of TweetID will be a timestamp, second part an auto-incrementing number. We can then figure out the shard number from this TweetId and store it there. What could be the size of our TweetID? -If our epoch time started today, the number of bits we need to store the numnber of seconds for the next 50 years: +If our epoch time started today, the number of bits we need to store the number of seconds for the next 50 years: ``` Number of seconds for the next 50 years: @@ -180,15 +179,13 @@ Since on average we expect 1150 new tweets every second i.e we can allocate 17 bits to store auto incremented sequence. This makes our tweetID 48 bits long. Every second, we can store 2^17(130k) new tweets. -We can reset our auto incrementing sequence every second. For fault tolerance and better performance, we can have two DB serves to generate auto-incrementing keys; one odd numbers and one even numbered keys. +We can reset our auto incrementing sequence every second. For fault tolerance and better performance, we can have two DB servers to generate auto-incrementing keys; one odd numbers and one even numbered keys. If we assume our current epoch seconds begins now, TweetIDs will look like this: ```python -from datetime import datetime -epoch = int(datetime.now().timestamp()) - +epoch = 1571691220 print('epoch - autoincrement') for i in range(1,5): print(f'{epoch}-{i:06}') @@ -243,12 +240,12 @@ We can add Load balancing layer at 3 places: 2. Between app servers and DB replication servers 3. Between Aggregation servers and Cache servers -We can adopt a simple round robin approach to distribure incoming requests equally among servers. +We can adopt a simple round robin approach to distribute incoming requests equally among servers. Benefits of this LB approach: - Simple to implement with no overhead -- If a server is dead, LB will take it our for the rotation and will stop sending any traffic to it. +- If a server is dead, LB will take it out from the rotation and will stop sending any traffic to it. -Problem with Round Robin is that it doesn't know if a server is overloaded with requests or slow. It won't stop sending requests to that server. To fix this, a moe intelligent LB solution can be placed that periodically queries backend server about their load and adjusts traffic to it based on that. +> Problem with Round Robin is that it doesn't know if a server is overloaded with requests or if it's slow. It won't stop sending requests to that server. To fix this, the LB can periodically query the backend server about its load and adjusts traffic to it based on that. ## 9. 
Extended Requirements
diff --git a/designing_twitter_search.md b/designing_twitter_search.md
new file mode 100644
index 0000000..333e144
--- /dev/null
+++ b/designing_twitter_search.md
@@ -0,0 +1,149 @@
+# Designing Twitter Search

We'll design a service that can efficiently store and query user tweets.

## 1. Requirements and System Goals
- Assume Twitter has 1.5 billion total users with 800 million daily active users.
- On average Twitter gets 400 million tweets every day.
- The average size of a tweet is 300 bytes.
- Assume 500M searches a day.
- The search query will consist of multiple words combined with AND/OR.


## 2. Capacity Estimation and Constraints

```
    400 million new tweets each day,
    each tweet is on average 300 bytes:
    400M * 300 bytes => 120GB/day

    Write volume per second:
    120 GB / (24 hours * 3600 seconds) ~= 1.38MB/second
```


## 3. System APIs
We can have REST APIs to expose the functionality of our service.

```python

search(
  api_dev_key: string,  # The API developer key of a registered account, used for things like throttling users based on their allocated quota.
  search_terms: string, # A string containing the search terms.
  max_results_to_return: number, # Number of tweets to return.
  sort: number, # (Optional) Sort mode: Latest first (0 - default), Best matched (1), Most liked (2).
  page_token: string, # This token specifies a page in the result set that should be returned.
)
```
Returns: (JSON)
```
A JSON containing info about a list of tweets matching the search query.
Each result entry can have the user ID & name, tweet text, tweet ID, creation time, number of likes, etc.
```



## 4. Detailed Component Design
1. **Storage:** Since we have a huge amount of data, we need a data partitioning scheme that will efficiently distribute it across multiple servers.


Storage needed over a 5-year plan:
```
    120 GB/day * 365 days * 5 years ~= 200TB

```

We never want to be more than 80% full at any time, so we'll need approximately 250TB of storage. Assuming we also keep an extra copy for fault tolerance, our total storage comes to 500TB.

Assuming a modern server can store up to 5TB of data, we'd need 100 such servers to hold all the data for the next 5 years.

Let's start with a simplistic design where we store the tweets in a PostgreSQL DB, in a table with two columns: TweetID and TweetText.
Partitioning can be based on TweetID. If our TweetIDs are unique system-wide, we can define a hash function that maps a TweetID to the storage server holding that tweet object.

#### How can we create system-wide unique TweetIDs?
If we're getting 400M tweets per day, how many tweets can we expect over the next five years?
```
    400M * 365 * 5 years => 730 billion tweets
```
Since 2^40 (about 1.1 trillion) is greater than 730 billion, a 5-byte number is enough to identify TweetIDs uniquely. Assume we have a service that generates a unique TweetID whenever we need to store an object. We can feed the TweetID to our hash function to find the storage server and store the tweet object there.

2. **Index:** Since our tweet queries will consist of words, let's build an index that can tell us which words occur in which tweet objects.


Assume:
- We index all English words,
- plus some well-known nouns like people's names, city names, etc.
- We have 300K English words and 200K nouns, so 500K terms in total.
- The average length of a word is 5 characters.

```
    If we keep our index in memory, we need:

    500K * 5 => 2.5 MB
```

Assume:
  - We keep the index in memory for all tweets from our last two years.
+``` + Since we'll get 730 Billion tweets in the next 5 years, + + 292Billion (2 year tweets) * 5 => 1460 GB +``` + +So our index would be like a big distributed hash table, where 'key' would be the word and 'value' will be a list of TweetIDs of all those tweets which contain that word. + +Assume: + - Average of 40 words in each tweet, + - 15 words will need indexing in each tweet, since we won't be indexing prepositions and other small words (the, in, an, and) + +> This means that each TweetID will be stored 15 times in our index. + +so total memory we will need to store our index: +``` + (1460 * 15) + 2.5MB ~= 21 TB +``` +> Assuming a high-end server holds 144GB of memory, we would need 152 such servers to hold our index. + +## Sharding our Data + + +#### Sharding based on Words: +While building the index, we'll iterate through all words of a tweet and calculate the hash of each word to find the server where it would be indexed. To find all tweets containing a specific word we have to query only server which contains this word. + +Issues with this approach: +- If a word becomes hot? There will be lots of queries (high load) on the server holding the word, affecting the service performance. +- Over time, some words can end up storing a lot of TweetIDs compared to others, therefore maintaining a uniform distribution of words while tweets are growing is tricky. + +To recover from this, we can repartition our data or use [Consistent Hashing](https://en.wikipedia.org/wiki/Consistent_hashing) + + +#### Sharding based on tweet object +While storing, we will pass the TweetID to our hash function to find the server and index all words of the tweet on that server. +While querying for a particular word, we'll query all servers, and each server will return a set of TweetIDs. A centralized server will aggregate these results to return them to the user. + +![](images/sharding_based_on_tweet_object.png) + +## 6. Fault Tolerance +We can have a secondary replica of each server and if the primary one dies, it can take control after the failover. +Both primary and secondary servers will have the same copy of the index. + +How can we efficiently retrieve a mapping between tweets and the index server? We have to build a reverse index that will map all the tweetID to their index server. We'll keep this in the Index-Builder server. + +- build a Hashtable, where key = index server number and value = HashSet containing all TweetIDs being kept at that index server. +- A HashSet will help us to add/remove tweets from our index quickly. + +So whenever an index server has to rebuild itself, it can simply ask the Index-Builder server for all tweets it needs to store and then fetch those tweets to build the index. We should also have a replica of the Index-builder server for fault tolerance. + +## 7. Caching +We can introduce a cache server in front of our DB. We can also use Memcached, which can store all hot tweets in memory. App servers before hitting the backend DB, can quickly check if the cache has that tweet. Based on clients' usage patterns, we can adjust how many cache servers we need. For cache eviction policy, Least Recently Used (LRU) seems suitable. + +## 8. Load Balancing +Add LB layers at two places: +1. Between Clients and Application servers, +2. Between Application servers and Backend server. + +LB approach: +- Use round robin approach: distrubute incoming requests equally among servers. 
+- Simple to implement and no overhead +- If as server is dead, LB will take it out of rotation and stop sending traffic to it +- Problem is if a server is overloaded, or slow, the LB will not stop sending new requests to it. To fix this, a more intelligent LB solution can be placed that periodically queries the server about the load and adjust traffic based on that. diff --git a/designing_typeahead_suggestion.md b/designing_typeahead_suggestion.md index eb59fae..b8576cf 100644 --- a/designing_typeahead_suggestion.md +++ b/designing_typeahead_suggestion.md @@ -1,4 +1,3 @@ - # Designing Typeahead Suggestion Typeahead is a real-time suggestion service which recommends terms to users as they enter text for searching. diff --git a/designing_uber_backend.md b/designing_uber_backend.md index 7388f2c..b7c7445 100644 --- a/designing_uber_backend.md +++ b/designing_uber_backend.md @@ -1,4 +1,3 @@ - # Designing Uber Backend Let's design a ride-sharing service like Uber, connecting a passenger who needs a ride with a driver who has a car. @@ -102,8 +101,8 @@ To receive location updates from all active drivers, we get DriverID and their l The memory and bandwidth requirements can be easily handled by one server, but for scalability, performance, and fault tolerance, we should distribute DriverLocationHT onto multiple servers. We can distribute beased the DriverID to make the distribution completely random. Let's call the machines holding DriverlocationHt the Driver location servers. The servers will: -1. As soon as they receive driver location update, broadcast that information to all interested customers. -2. Notify the respective QuadTree server to refresh the driver's location. This happens every 15 seconds. +- As soon as they receive driver location update, broadcast that information to all interested customers. +- Notify the respective QuadTree server to refresh the driver's location. This happens every 15 seconds. ### Broadcasting driver's location to customers @@ -167,9 +166,3 @@ We can rank search results not just by proximity but also by popularity or relev Let's assume we keep track of the overall ratings in our database and QuadTree. An aggregated number can represent this popularirt in our system. For example, while searching for the top 10 drivers within a given radius, we can ask each partition of QuadTree to return the top 10 drivers with a maximum rating. The aggregator server can then determine the top 10 drivers among all drivers returned. - - - -```python - -``` diff --git a/designing_webcrawler.md b/designing_webcrawler.md new file mode 100644 index 0000000..c648c98 --- /dev/null +++ b/designing_webcrawler.md @@ -0,0 +1,191 @@ +# Designing a Web Crawler + +Let's design a Web Crawler that will browse and download the World Wide Web. + +## What's a Web Crawler? +It's a software program which browses the WWW in a methodical and automated manner, collecting documents by recursively fetching links from a set of starting pages. + +Search engines use web crawling as a means to provide up-to-date data. Search engines download all pages and create an index on them to perform faster searches. + +Other uses of web crawlers: +- Test web pages and links for valid syntax and structure. +- To search for copyright infringements. +- To maintain mirror sites for popular web sites. +- To monitor sites to see where their content changes. + +## 1. Requirements and Goals of the System +**Scalability:** Our service needs to be scalable, since we'll be fetching hundreds of millions of web documents. 
+ +**Extensibility:** Our service should be designed in a way that allows newer functionality to be added to it. It should be able to allow for newer document formats that needs to be downloaded and processed in future. + +## 2. Design Considerations +We should be asking a few questions here: + +#### Is it a crawler for only HTML pages? Or should we fetch and store other media types like images, videos, etc. +It's important to clarify this because it will change the design. If we're writing a general-purpose crawler, we might want to break down the parsing module into different sets of modules: one for HTML, another for videos,..., so basically each module handling a given media type. + +For this design, let's assume our web crawler will deal with HTML only. + +#### What protocols apart from HTTP are we looking at? FTP? +Let's assume HTTP for now. Again, it should not be hard to extend it to other protocols. + +#### What is the expected number of pages we will crawl? How big will be the URL Database? +Let's assume we need to crawl 1Billion websites. Since one site can contain many URLs, assume an upper bound of `15 billion web pages`. + + +#### Robots Exclusion Protocol? +Some web crawlers implement the Robots Exclusion Protocol, which allows Webmasters to declare parts of their sites off limits to crawlers. The Robots Exclusion Protocol requires a Web Crawler to fetch a document called `robot.txt` which contains these declarations for that site before downloading any real content from it. + + +## 3. Capacity Estimation and Constraints +If we crawl 15B pages in 4 weeks, how many pages will we need to fetch per second? + +```text + 15B / (4 weeks * 7 days * 86400 sec) ~= 6200 web pages/sec +``` + +**What about storage?** Pages sizes vary. But since we are dealing with HTML only, let's assume an average page size is 100KB. With each page, if we're storing 500 bytes of metadata, total storage we would need is: + +```text + 15B * (100KB + 500 bytes) + 15 B * 100.5 KB ~= 1.5 Petabytes +``` + +We don't want to go beyond 70% capacity of our storage system, so the total storage we will need is: + +```text + 1.5 petabytes / 0.7 ==> 2.14 Petabytes +``` + +## 4. High Level Design +The basic algorithm of a web crawler is this: + +1. Taking in a list of seed URLs as input, pick a URL from the unvisited URL list. +2. Find the URL host-name's IP address. +3. Establish a connection to the host to download its corresponding documents. +4. Parse the documents contents to look for new URLs. +5. Add the new URLs to the list of unvisited URLs. +6. Process the downloaded document, e.g, store it, or index the contents +7. Go back to step 1. + +### How to Crawl + +Breath first or depth first? +Breadth-first search (BFS) is usually used. We can also use Depth-first search especially when the crawler has already established a connection with a website. In this situation, the crawler will just DFS all the URLs within the website to save some handshaking overhead. + +**Path-ascending crawling:** Path-ascending crawling helps discover a hidden or isolated resources. In this scheme, a crawler would ascend to every path in each URL like so: +```text + given a seed URL of http://xyz.com/a/b/one.html + + it will attempt to crawl /a/b/, /a/ and / +``` + +### Difficulties implementing an efficient web crawler. +#### 1. Large volume of web pages +A large volume implies that the web crawler can only dowload a fraction of the web pages, so it's critical that the web crawler should be intelligent enough to prioritize download. 
+ +#### 2. Rate of change on web pages +Web pages change frequenty. By the time the crawler is downloading the last page from the site, the page may change dynamically, or a new page may be added. + +**Components of a bare minimum crawler:** +1. **URL frontier:** stores a list of URLs to download and prioritize which URLs should be crawled first. +2. **HTTP Fetcher:** to retrieve a web page from the hosts server. +3. **Extractor:** to extract links from HTML documents. +4. **Duplicate Remover:** to make sure same content is not extracted twice. +5. **Datastore:** to store retrieved pages, URLs and other metadata. + +![](images/designing_webcrawler_high_level.png) + +## 5. Detailed Component Design +Assume the crawler is running on a single server, where multiple working threads are performing all the steps needed to download and process a document in a loop. + +**Step 1:** remove an absolute URL from the shared URL frontier for downloading. the URL begins with a scheme (e.g HTTP) which identifies the network protocol that should be used to download it. +We can implement these protocols in a modular way for extensibility, so that later if our crawler needs to support more protocols, it can easily be done. + +**Step 2:** Based on the URL's scheme, the worker calls the appropriate protocol module to download the document. + +**Step 3:** After downloading, the document is written into a Document Input Stream (DIS). This will enable other modules to re-read the document multiple times. + +**Step 4:** The worker invokes the dedupe test to see whether this document (associated with a different URL) has already been seen before. If so, the document is not processed any further and the worker thread removes the next URL from the frontier. + +**Step 5:** Process the downloaded document. Each doc has a different MIME type like HTML page, Image, Video etc. We can implement these MIME schemes in a modular way, to allow for extensibility when our crawler need to support more types. The worker invokes the process method of each processing module with that MIME type. + +**Step 6:** The HTML processing module will extract links from the page. Each link is converted into an absolute URL and testsed against a user-supplied filter to determine if it should be downloaded. If the URL passes the filter, the worker performs the URL-dedupe test, which checks if the URL has been downloaded before. If it's new, it is added into the URL frontier. + +![](images/designing_crawler_detailed_component.png) + +Let's see how each component can be distributed onto multiple machines: + +#### a. The URL frontier +This is the data structure that contains all the URLs that are queued to be downloaded. We crawl by performing a breadth-first traversal of the Web, starting from the pages in the seed set. We can use a FIFO queue to implement this. + +Since we have a huge list of URLs to crawl, we can distribute our URL frontier into multiple servers. Let's assume on each server we have multiple threads performing the crawling tasks. Our hash function maps each URL to the server responsible for crawling it. + +Constraints for a distributed URL frontier: +- The crawler should not overload a server by downloading a lot of pages. +- Multiple machines should not connect to a single web server. + +> For each server, we can have distinct FIFO sub-queues, where each worker thread will remove URLs for crawling. + +Once a new URL is added, we determine which sub-queue it belongs to by using the URL's canonical hostname. 
Our hash function will map each **hostname** to a **thread number**. Together, these two points imply, that only one worker thread will download documents from a given web server and also, by using FIFO queue, it'll not overload a web server. + +##### How big will our URL frontier be? +The size would be in the 100s of millions of URLs. Therefore, we need to store the URLs on disk. We can implement our queues in such a way that they have separate buffers for enqueuing and dequeuing. Enqueuing buffer, once filled, will be dumped to the disk. Dequeuing buffers will keep a cache of URLs that need to be visited; periodically reading from the disk to fill the buffer. + +#### b. The Fetcher Module +This will download the document corresponding to a given URL using the appropriate network protocol like HTTP. Webmasters create a `robot.txt` to make certain parts of the websites off limits for the crawler. +To avoid downloading this text file on every request, our HTTP protocol module can maintain a cache mapping host-names to their robot's exclusion rules. + +#### c. Document input steam +We cache the document locally using DIS to avoid downloading the document multiple times. + +A DIS is an input stream that caches the doc's entire contents in memory. It also provides methods to re-read the document. Larger documents can be temporarily written to a backing file. + +Each worker will have a DIS, which it reuses from document to document. After extracting a URL from the frontier, the worker passes that URL to the relevant protocol module (in our case, for HTTP) which initializes the DIS from a network connection to contain the document's contents. The worker then passes the DIS to all relevant processing modules. + +#### d. Document Dedupe test +To prevent processing a doc more than once, we perform a dedupe test on each doc to remove duplication. + +We can calculate a 64-bit checksum (using MD5 or SHA) on every processed document and store it in a database. For each new document, we compare its checksum to all previously calculated checksums to see if the doc has been seen before. + +##### How big will be the checksum store +We need to keep a unique set containing checksums of all previously process documents. Considering 15 billion distinct web pages, we would need about +```text + 15B * 8 bytes => 120 GB +``` +We can have a small LRU cache on each server with everything backed by persistent storage. + +Steps: +- Check if the checksum is present in the cache. +- If not, check if the checksum is in the back storage. +- If found, ignore the document. +- Otherwise, add the checksum to the cache and back storage. + +#### e. URL filters +URL filtering mechanism provides a customizable way to control the set of URLs that are downloaded. We can define filters to restrict URLs by domain, prefix or protocol type. + +Before adding the URL to the frontier, the worker thread consults the user-supplied URL filter. + +#### f. Domain name resolution +Before we contact a web server, a Web crawler must use a DNS to map the Web server's hostname into an IP address. DNS name resolution will be a big bottleneck of our crawlers given the amount of URLs we are working with. To avoid repeated requests, we can start caching DNS results by building our local DNS server. + +#### g. URL dedupe test +While extracting links, we will encounter multiple links to the same document. To avoid downloading and processing a doc multiple times, we use a URL dedupe test on each extracted link before adding it to the URL frontier. 
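A minimal sketch of that dedupe check, assuming an in-memory set stands in for the checksum storage described next; the MD5 choice mirrors the checksum approach used for document dedupe above, and `frontier.enqueue` is an illustrative name, not a defined API.

```python
import hashlib

seen_url_checksums = set()   # stands in for the cache + persistent checksum store

def is_new_url(url: str) -> bool:
    """Return True only the first time a given (canonicalized) URL is seen."""
    checksum = hashlib.md5(url.encode("utf-8")).hexdigest()
    if checksum in seen_url_checksums:
        return False
    seen_url_checksums.add(checksum)
    return True

# Only links that pass the dedupe test are added to the frontier:
# for url in extracted_links:
#     if is_new_url(url):
#         frontier.enqueue(url)
```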
+ +We can store a fixed-size checksums for all the URLs seen by our crawler in a database. To reduce number of operations performed on the DB, we can keep an in-memory cache of popular URLs on each host shared by all threads. This is because links to some URLs are quite common, so caching will lead to a high in-memory hit rate. + +To keep a unique set of checksums of all previously seen URLS, we would need: +```text + 15 Billion URLS * 4 bytes => 60 GB +``` + +#### h. Checkpointing +A crawl of the entire Web takes weeks to complete. To guard against failures, our crawler can write regular snapshots of its state to the disk. An interrupted on aborted crawl can easily be restarted from the latest checkpoint. + + +## 6. Fault Tolerance +We should use consistent hashing for distribution among crawler servers. Consistent hashing will help in: +- replacing dead hosts +- distributing load among crawling servers + +All crawling servers will perform regular checkpointing and storing their FIFO queues to disks. If a server goes down, we can replace it. During the replacement, consistent hashing should shift the load to other load-capable servers. diff --git a/designing_youtube_or_netflix.md b/designing_youtube_or_netflix.md index 531e591..06044c5 100644 --- a/designing_youtube_or_netflix.md +++ b/designing_youtube_or_netflix.md @@ -1,4 +1,3 @@ - # Designing Youtube or Netflix Let's design a video sharing service like Youtube where users will be able to upload/view/search videos. @@ -7,11 +6,11 @@ Similar Services: Netflix, Vimeo ## 1. Requirements and Goals of the System #### Functional Requirements: -1. Users should be able to upload videos. -2. Users should have ability to share and view videos. -3. Users should be able to perform searches based on video titles. -4. Service should have starts on videos, e.g likes/dislikes, no. of views, etc. -5. Users should be able to add and view comments on videos +- Users should be able to upload videos. +- Users should have ability to share and view videos. +- Users should be able to perform searches based on video titles. +- Service should have starts on videos, e.g likes/dislikes, no. of views, etc. +- Users should be able to add and view comments on videos #### Non-Functional Requirements: @@ -109,12 +108,12 @@ A media stream (video chunk) from the given offset. ## 4. High Level Design At a high-level we would need the following components: -1. **Processing Queue:**: Each uploaded video will be pushed to a processing queue ot be de-queued later for encoding, thumbnail generation, and storage. -2. **Encoder:** To encode each uploaded video into multiple formats. -3. **Thumbnails generator:** To generate thumbnails for each video. -4. **Video and Thumbnail storage:** To store video and thumbnail files in some distributed file storage. -5. **User DB:** To store user's info e.g name, email, address, etc. -6. **Video metadata storage:** A metadata DB to store information about videos like title, its file path, uploading user, total views, likes, comments etc. +- **Processing Queue:**: Each uploaded video will be pushed to a processing queue ot be de-queued later for encoding, thumbnail generation, and storage. +- **Encoder:** To encode each uploaded video into multiple formats. +- **Thumbnails generator:** To generate thumbnails for each video. +- **Video and Thumbnail storage:** To store video and thumbnail files in some distributed file storage. +- **User DB:** To store user's info e.g name, email, address, etc. 
+- **Video metadata storage:** A metadata DB to store information about videos like title, its file path, uploading user, total views, likes, comments etc. ![](images/hld_youtube.png) @@ -139,7 +138,7 @@ For each video comment, we nneed to store: - CreatedAt #### User data storage - MySQL -* UserID, Name, email, address, age, registration details etc +- UserID, Name, email, address, age, registration details etc ## 6. Detailed Component Design The service will be read-heavy, since more people are viewing than uploading videos. We'll focus on building a system that can retrieve videos quickly. We can expect a read:write ratio of 200:1. @@ -154,8 +153,8 @@ For metadata, we can have a master-slave config where writes go to master first #### Where would thumbnails be stored? There will be a lot more thumbnails than videos. Assume each video has 5 thumbnails, we need to have a very efficient storage system that'll serve huge read traffic. Two considerations: -1. Thumbnails are small files, max 5KB each. -2. Read traffic for thumbnails will be huge compared to videos. Users will be watching one video at a time, but they might be looking at a page that has 20 thumbnails of other videos. +- Thumbnails are small files, max 5KB each. +- Read traffic for thumbnails will be huge compared to videos. Users will be watching one video at a time, but they might be looking at a page that has 20 thumbnails of other videos. Let's evaluate storing thumbnails on disk. Given the huge number of files, we have to perform a lot of seeks to different locations on the disk to read these files. This is quite inefficient and will result in higher latencies. @@ -197,11 +196,11 @@ We can further improve our performance by introducing a cache to store hot video With a huge number of users uploading massive amounts of video data, our service will have to deal with widspread video duplication. Duplicate videos often differ in aspect ratios or encodings, can contain overlays or additional borders, or can be excerpts from a longer original video. Having duplicate videos can have the following impact on many levels: -1. Data Storage: we'd waste storage by keeping multiple copies of the same video. -2. Caching: They'll degrade cache efficiency by taking up space tht could be used for unique content. -3. Network usage: They'll increase data sent over the network to in-network caching systems. -4. Energy consumption: Higher storage, inefficient cache and high network usage could result in energy wastage. -5. Effect to our user: Duplicate search results, longer video startup times, and interrupted streaming. +- Data Storage: we'd waste storage by keeping multiple copies of the same video. +- Caching: They'll degrade cache efficiency by taking up space tht could be used for unique content. +- Network usage: They'll increase data sent over the network to in-network caching systems. +- Energy consumption: Higher storage, inefficient cache and high network usage could result in energy wastage. +- Effect to our user: Duplicate search results, longer video startup times, and interrupted streaming. #### How do we implement deduplication? Deduplication should happen when a user is uploading a video as compared to post-processing it to find videos later. Inline deduplication will save us a lot of resources that could be used to encode, transfer, and store the duplicate copy of the video. As soon as any user starts uploading a vidoe, our service can run video matching algorithms to find duplications. 
Such algorithms include:
diff --git a/distributed_logging.md b/distributed_logging.md
new file mode 100644
index 0000000..6960beb
--- /dev/null
+++ b/distributed_logging.md
@@ -0,0 +1,120 @@
+# Designing a Distributed Logging System
+
+One of the most challenging aspects of debugging distributed systems is understanding system behavior in the period leading up to a bug.
+As we all know by now, a distributed system is made up of microservices calling each other to complete an operation.
+Multiple services can talk to each other to complete a single business requirement.
+
+In this architecture, logs are accumulated on each machine running the microservice. A single microservice can also be deployed to hundreds of nodes. In an architectural setup where multiple microservices are interdependent, the failure of one service can result in failures of other services. If we do not have well-organized logging, we might not be able to determine the root cause of a failure.
+
+## Understanding the system
+### Restraining log size
+At any given time, the distributed system logs hundreds of concurrent messages.
+The number of logs increases over time, but not all of them are important enough to be logged.
+To solve this, logs have to be structured. We need to decide what to log into the system at the application or logging level.
+
+### Log sampling
+Storage and processing resources are a constraint. We must determine which messages we should log into the system so as to control the volume of logs generated.
+
+High-throughput systems will emit lots of messages from the same set of events. Instead of logging all the messages, we can use a sampler service that only logs a smaller set of messages from a larger chunk. The sampler service can use various sampling algorithms such as adaptive and priority sampling to log events. For large systems with thousands of microservices and billions of events per second, an appropriate sampling strategy is essential to keep the log volume manageable.
+
+### Structured logging
+The first benefit of structured logs is better interoperability between log readers and writers.
+Use structured logging to make the job of the log processing system easier.
+
+### Categorization
+The following severity levels are commonly used in logging:
+- `DEBUG`
+- `INFO`
+- `WARNING`
+- `ERROR`
+- `CRITICAL`
+
+## Requirements
+### Functional requirements
+- Writing logs: the microservices should be able to write into the logging system.
+- Query-based logs: it should be effortless to search for logs.
+- The logs should reside in distributed storage for easy access.
+- The logging mechanism should be secure and not vulnerable. Access to logs should be restricted to authenticated users, with only the necessary read-only permissions granted.
+- The system should avoid logging sensitive information like credit card numbers, passwords, and so on.
+- Since logging is an I/O-heavy operation, the system should avoid logging excessive information. Logging everything is unnecessary; it only takes up more space and impacts performance.
+- Avoid logging personally identifiable information (PII) such as names, addresses, emails, etc.
+
+
+### Non-functional requirements
+- **Low latency:** Logging is a resource-intensive operation that's significantly slower than CPU operations. To ensure low latency, the logging system should be designed so that logging does not block or delay a service's main application process.
+- **Scalability:** Our logging system must be scalable. It should efficiently handle increasing log volumes over time and support a growing number of concurrent users.
+- **Availability:** The logging system should be highly available to ensure data is consistently logged without interruption.
+
+## Components to use
+We will use the following components:
+- **Pub-Sub system:** We will use a publish-subscribe system to efficiently handle the large volume of logs.
+- **Distributed Search:** We will employ distributed search to query logs efficiently.
+
+>A distributed search system is a search architecture that efficiently handles large datasets and high query loads by spreading search operations across multiple servers or nodes. It has the following components:
+>1. **Crawler:** This component fetches the content and creates documents.
+>2. **Indexer:** Builds a searchable index from the fetched documents.
+>3. **Searcher:** Responds to user queries by running searches on the index created by the indexer.
+
+- **Logging Accumulator:** This component will collect logs from each node and store them in a central location, allowing for easy retrieval of logs related to specific events without needing to query each individual node.
+- **Blob Storage:** The blob storage provides scalable and reliable storage for large volumes of data.
+- **Log Indexer:** Due to the increasing number of log files, efficient searching is crucial. The log indexer utilizes distributed search techniques to index logs and make them searchable, ensuring fast retrieval of relevant information.
+- **Visualizer:** The visualizer component provides a unified view of all logs. It enables users to analyze and monitor system behavior and performance through visual representations and analytics.
+
+
+## API Design
+We design APIs for reads and writes:
+
+
+Read
+```python
+searching(keyword)
+```
+This API call returns a list of logs that contain the keyword.
+
+Write
+```python
+write_logs(unique_ID, message)
+```
+This API call writes the log message against a unique key.
+
+
+## High Level System Design
+
+![](images/distributed_logging_design.png)
+
+## Component Design
+
+### Logging at Various Levels in a Server
+In a server environment, logging occurs across various services and applications, each producing logs crucial for monitoring and troubleshooting.
+
+#### Server Level
+- **Multiple Applications:** A server hosts multiple apps, such as App1, App2, etc., each running various microservices such as user authentication, cart retrieval, and storage in an e-commerce context.
+- **Logging Structure:** Each service within an application generates logs identified by an ID comprising the application ID, the service ID, and a timestamp, ensuring unique identification and enabling event-causality determination.
+
+#### Logging Process
+Each service will push its logs into the log accumulator service.
+That service will store the logs logically and push them to a pub-sub system.
+
+We use the pub-sub system to handle the scalability challenge by efficiently managing and distributing a large volume of logs across the system.
+
+#### Ensuring Low Latency and Performance
+- **Asynchronous Logging:** Logs are sent asynchronously via low-priority threads to avoid impacting the performance of critical processes. This also ensures continuous availability of services without any disruptions caused by logging activities. (See the sketch after this list.)
+- **Data Loss Awareness:** Logging large volumes of messages can lead to potential data loss. To balance user-perceived latency with data persistence, log services often use RAM and save data asynchronously. To minimize data loss, we will add more log accumulators to handle increasing concurrent users.
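+
+Below is a minimal Python sketch of the asynchronous-logging idea above, assuming a hypothetical `accumulator_client` object with a `push(batch)` method; that client is not part of the design text and stands in for whatever transport the log accumulator exposes. The application thread only enqueues; a daemon thread ships batches in the background, and a full buffer drops messages, which is the data-loss trade-off noted above.
+
+```python
+import queue
+import threading
+
+
+class AsyncLogger:
+    """Non-blocking logger: callers enqueue, a daemon thread ships batches to the accumulator."""
+
+    def __init__(self, accumulator_client, max_buffer=10_000, batch_size=100):
+        self._client = accumulator_client          # hypothetical log-accumulator client
+        self._queue = queue.Queue(maxsize=max_buffer)
+        self._batch_size = batch_size
+        threading.Thread(target=self._flush_loop, daemon=True).start()
+
+    def log(self, unique_id, message):
+        try:
+            # Never block the application thread; drop the record if the buffer is full.
+            self._queue.put_nowait((unique_id, message))
+        except queue.Full:
+            pass                                    # accepted data-loss trade-off
+
+    def _flush_loop(self):
+        while True:
+            batch = [self._queue.get()]             # block until at least one record arrives
+            while len(batch) < self._batch_size:
+                try:
+                    batch.append(self._queue.get_nowait())
+                except queue.Empty:
+                    break
+            self._client.push(batch)                # e.g. one write_logs(unique_ID, message) per record
+```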
+
+#### Log Retention
+Logs also have an expiration date. We can delete regular logs after a few days or months, while compliance logs are usually stored for up to five years. It depends on the requirements of the application.
+
+Another crucial component, therefore, is an expiration checker that verifies which logs have to be deleted.
+
+### Data Center Level
+All servers in the data center transmit logs to a publish-subscribe architecture.
+By utilizing a horizontally scalable pub-sub framework, we can effectively manage large log volumes.
+
+Implementing multiple pub-sub instances within each data center enhances scalability and prevents throughput limitations and bottlenecks. Subsequently, the pub-sub system routes the log data to blob storage.
+
+![](images/distributed_logging_datacenter_level.png)
+
+Now, data in the pub-sub system is temporary and is deleted after a few days, once it has been moved to archival storage.
+However, while the data is still present in the pub-sub system, we can utilize it using the following services:
+- **Alerts Service:** This service identifies alerts and errors and notifies the appropriate stakeholders if a critical error is detected, or sends a message to a monitoring tool, ensuring timely awareness of important alerts. The service will also monitor logs for suspicious activities or security incidents, triggering alerts or automated responses to mitigate threats.
+- **Analytics service:** This service analyzes trends and patterns in the logged data to provide insights into system performance, user behavior, or operational metrics.
diff --git a/executed_designing_api_rate_limiter.ipynb b/executed_designing_api_rate_limiter.ipynb
new file mode 100644
index 0000000..c2ca1e9
--- /dev/null
+++ b/executed_designing_api_rate_limiter.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ " # Designing an API Rate Limiter\n",
+ "\n",
+ "An API rate limiter will throttle users based on the number of requests they are sending.\n",
+ "\n",
+ "## Why Rate Limiting?\n",
+ "Rate limiting is all about traffic/load management. Clients can misbehave and trigger a retry storm, while others might request the same information over and over again, increasing traffic and unnecessary load on the system. This can degrade the performance of the service, and the ability of the system to reliably handle incoming requests. \n",
+ "\n",
+ "Rate limiting helps to protect services against abusive behaviors targeting the application such as retry storms, Denial-of-Service (DoS) attacks, brute-force password attempts, brute-force credit card transactions, etc.\n",
+ "\n",
+ "Rate limiting saves the company money by eliminating service and infrastructure costs that would have otherwise been used to handle spamming.\n",
+ "\n",
+ "Here are some scenarios to show the importance of rate limiting our API/services:\n",
+ "\n",
+ "- **Prevents service degradation:** It reduces traffic spikes so that the service stays reliable for all users.\n",
+ "\n",
+ "- **Misbehaving clients:** Sometimes, clients can overwhelm servers by sending a large number of requests, either intentionally or unintentionally.
\n", + "\n", + "- **Security:** Limiting the number of times a user should authenticate with a wrong password.\n", + "\n", + "- **Preventing abusive and bad design practices:** Without API limits, developers of client apps might request the same info over and over again.\n", + "\n", + "- **Revenue:** Services can limit operations based on the tier of their customer's service and thus create a revenue model off the rate limiting. To go beyond the set limit, the user has to buy higher limits. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Requirements and System Goals\n", + "\n", + "#### Functional requirements\n", + "1. Limit the number of requests an entity can send to an API within a time window.\n", + "2. The user should get an error whenever they cross the defined threshold within a single server or across a set of servers.\n", + "\n", + "#### Non-Functional requirements\n", + "1. The system should be highly available, always protecting our API service from external attacks.\n", + "2. The rate limiter should NOT introduce substantial latencies affecting the user experience." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Throttling Types\n", + "\n", + "* ***Hard Throttling*** – Number of API requests cannot exceed the throttle limit.\n", + "\n", + "* ***Soft Throttling*** – Set the API request limit to exceed by some percentage. E.g, if the rate-limit = 100 messages/minute, and 10% exceed-limit, our rate limiter will allow up to 110 messages per minute.\n", + "\n", + "* ***Dynamic Throttling (Priority throttling)*** – The number of requests can exceed the limit if the system has some free resources available. The system can progressively throttle requests based on some predefined priority. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Algorithms used for Rate Limiting\n", + "\n", + "#### Fixed Window Algorithm\n", + "Here, the time window is considered from the start of the time-unit to the end of the time-unit. \n", + "For instance, a period will be considered 0-60 sec for a minute regardless of the time frame at which the API request has been made.\n", + "\n", + "The diagram below shows that we will only throttle 'm5' message, if our rate limit is 2 messages per second.\n", + "\n", + "![](images/fixed_rolling_window.svg)\n", + "\n", + "#### Rolling Window Algorithm\n", + "The time window is considered from the fraction of time at which the request is made plus the time window length.\n", + "\n", + "With a rate limit of 2 messages per second, the two messages sent at the 300th millisecond (m1) and 400th millisecond (m2), we'll count them as two messages starting from the 300th of that second to the 300th of the next second (making up one second).\n", + "\n", + "As a result, we will therefore throttle M3, M4 as shown above.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. High Level Design\n", + "\n", + "Once a new request arrives, the Web server first asks the Rate Limiter to decide if it will be served or throttled. If the request is not throttled, then it's passed to the API servers. \n", + "\n", + "![](images/rate_limiter_high_level_design.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Basic System Design and Algorithm\n", + "\n", + "Assume for each user, our rate limiter allows 3 requests/sec. 
\n", + "\n", + "For each user, store:\n", + "- a request count (how many requests the user has made)\n", + "- a timestamp when we started counting\n", + "\n", + "We can keep this in a hashtable, where:\n", + "\n", + "```python\n", + "# Key (userID): Value {count, start_time}\n", + "hashtable = {\n", + " 'userId0': {\n", + " 'count': 3, 'start_time': 1574866492\n", + " },\n", + " 'userId1': {\n", + " 'count': 1, 'start_time': 1574873716\n", + " },\n", + " ...\n", + "}\n", + "```\n", + "\n", + "When a new request comes in, the rate limiter will perform the following steps:\n", + "\n", + "1. If the `userID` does not exist in the hash-table, \n", + " - insert it, \n", + " - set `count` to 1 and set `start_time` to current epoch time\n", + "2. Otherwise, find the existing record of the userID, and \n", + " - if `current_time - start_time >= 1 minute`, reset `start_time` to be current time, \n", + " - set `count` to 1 and allow the request\n", + "3. If `current_time - start_time <= 1 minute` and\n", + " - If `count < 3`, increment the count and allow the request.\n", + " - If `count == 3`, reject the request.\n", + " \n", + "#### Problems with this Fixed Window Algorithm\n", + "1. We are resetting the `start_time` at the end of every minute, which means we can potentially allow twice the number of requests per minute.\n", + "\n", + "Imagine if a user sends 3 requests at the last second of a minute, they can immediately send 3 more requests at the very first second of the next minute, resulting in 6 requests in a span of two seconds. \n", + "\n", + "To fix this loophole, we'll use the sliding-window algorithm.\n", + "\n", + "![](images/fixed_window_problem.svg)\n", + "\n", + "2. Atomicity: The read and then write process can create a race condition. Imagine, a given user's current count = 2. If two seperate processes served each of these requests and concurrently read the count before either updated it, each process would erroneously think that the user had one more request to hit the rate limit. \n", + "\n", + "![](images/fixed_window_atomicity.svg)\n", + "\n", + "\n", + "#### Solutions\n", + "We can use a K-V store like Redis to store our key-value and solve the atomicity problem using [Redis lock](https://redis.io/topics/distlock) for the duration of the read-update operation. \n", + "This however, would slow down concurrent requests from the same user and introduce another layer of complexity.\n", + "\n", + " \n", + "\n", + "#### How much memory to store all the user data?\n", + "Assume the userID takes 8 bytes, epoch time needs 4 bytes, 2 bytes for count:\n", + "```\n", + "8 + 2 + 4 = 14 bytes\n", + "```\n", + "Let's assume our hash-table has an overhead of 30 bytes for each record. If we need to track 10 million users at any time:\n", + "```\n", + "Total memory = (14 + 30) bytes * 10 million users = 440,000,000 bytes => 440MB memory.\n", + "```\n", + "\n", + "If we assume that we need a 4-byte number to lock each user's record to solve the atomicity problems\n", + "```\n", + "Total memory = 4 bytes for lock + (14 + 30) bytes * 10 million users = 480,000,000 bytes => 480MB memory.\n", + "```\n", + "\n", + "This can fit in a single server. However, we wouldn't want to route all traffic in a single machine because of availability issue => that one server goes down for any reason, our only instance of the rate limiter service goes down with it. \n", + "\n", + "For instance, when rate limiting 1M users at 10 req/sec, this would be about 10 million queries per second for our rate limiter. 
This would be too much to handle for a single server. Practically, we can use Redis or Memcached in a distributed setup. We'll store all the data in remote Redis servers and all the Rate limiter servers will read/update these servers before serving or throttling any request.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Sliding Window Algorithm\n", + "\n", + "We can maintain a sliding window if we can keep track of each request per user.\n", + "\n", + "We will store the timestamp of each request in a [Redis Sorted Set](https://redis.io/docs/data-types/sorted-sets/).\n", + "\n", + "```python\n", + "hash_table = {\n", + " # userID: { Sorted Set }\n", + " 'userID-0': {1574860105, 1574881109, 1574890217 },\n", + " 'userID-1': {1574866488, 1574866493, 1574866499}\n", + " ...\n", + "}\n", + "```\n", + "Assume our rate limiter allows 3 requests/sec per user. \n", + "\n", + "When a new request comes in, the rate limiter will perform the following steps:\n", + "\n", + "1. Remove all timestamps from Sorted Set older than `1 second`.\n", + "2. Count elements in the set. Reject request if the count is greater than our throttling limit (3 for our case).\n", + "3. Insert current time in the sorted set and accept the request.\n", + "\n", + "#### Memory for storing user data?\n", + "Assume UserId takes 8 bytes, each epoch time takes 4 bytes.\n", + "\n", + "Now suppose we need a rate limiting of 500 requests per hour. Assume 20 bytes of overhead for hash-table and 20 bytes for sorted set. \n", + "\n", + "```\n", + "(8 + 4 + 20 bytes sorted set overhead) * 500 + (20 bytes hash-table overhead) = 12KB\n", + "```\n", + "\n", + "If we need to track 10 million users at any time:\n", + "```\n", + "Total memory = 12KB * 10 million users = 11718.75 MB ≈ 120 GB\n", + "```\n", + "\n", + "For 10M users, sliding window takes a lot of memory compared to fixed window; this won't scale well. We can combine the above two algorithms to optimize our memory usage.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## 7. Sliding Window + Counters\n", + "What if we keep track of request counts for each user using multiple fixed time windows. \n", + "\n", + "For example, if rate limit is hourly, keep a count for **each minute** and calculate the sum of all counter in the past hour when we receive a new request.\n", + "\n", + "This reduces our memory footprint. Consider a rate-limit at 500 requests/hour, with an additional limit of 10 requests/minute. *This means that when the sum of the counters with timestamps in the past hour `> 500`, the user has exceeded the rate limit.*\n", + "\n", + "In addition, the user can't send more than 10 requests per minute. This would be a reasonable and practical consideration, as none of the real users would send frequent requests. Even if they do, they'll see success with retries since their limits get reset every minute.\n", + "\n", + "\n", + "We can store our counter in a [Redis Hash](https://redis.io/docs/data-types/hashes/) because it's very efficient in storing <100 keys.\n", + "Which each request, increment a counter in the hash, it also sets the hash to [expire](https://redis.io/commands/ttl) an hour later. 
We will normalize each time to a minute.\n", + "\n", + "\n", + "```\n", + "Rate limiting allowing 3 requests per minute for User1\n", + "\n", + "[Allow request] 7:00:00 AM ---- \"User1\": {1574860100: 1}\n", + "[Allow request] 7:01:05 AM ---- \"User1\": { 1574860100: 1, 1574860160: 1}\n", + "[Allow request] 7:01:20 AM ---- \"User1\": { 1574860100: 1, 1574860160: 2}\n", + "[Allow request] 7:01:20 AM ---- \"User1\": { 1574860100: 1, 1574860160: 3}\n", + "[Reject request] 7:01:45 AM ---- \"User1\": { 1574860100: 1, 1574860160: 3}\n", + "[Allow request] 7:02:20 AM ---- \"User1\": { 1574860160: 3, 1574860220: 1}\n", + "```\n", + "\n", + "#### How much memory to store all user data?\n", + "\n", + "We'll need: \n", + "```\n", + "\n", + "UserID = 8 bytes\n", + "Counter = 2 bytes\n", + "Epoch time = 4 bytes\n", + "\n", + "Since we keep a count per minute, at max, we need 60 entries per user\n", + "8 + (4 + 2 + 20 Redis-hash overhead) * 60 entries + 20 Hash-table overhead = 1.6KB\n", + "```\n", + "\n", + "If we need to track 10 million users at any time:\n", + "```\n", + "\n", + "Total memory = 1.6KB * 10 million => 16 GB \n", + "( 92% less memory than the simple sliding window algorithm )\n", + "\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 8. Data Sharding and Caching\n", + "We can shard by `UserID` to distribute user data across different partitions.\n", + "\n", + "For fault tolerance and replication we should use Consistent Hashing. Consistent hashing is a very useful strategy for distributed caching system and DHTs. It allows us to distribute data across a cluster in such a way that will minimize reorganization when nodes are added or removed (resizing). \n", + "\n", + "  \n", + "\n", + "## Caching\n", + "We can get huge benefits from caching recent active users. \n", + "\n", + "Our app servers can quickly check if the cache has the desired record before hitting backend servers. Our rate limiter can benefit from the **Write-back cache** by updating all counters and timestamps in cache only. The write to the permanent storage can be done at fixed intervals. This way we can ensure minimum latency added to the user's request by the rate limiter. The reads can always hit the cache first; which will be extremely useful once the user has hit the rate limit and the rate limiter will only be reading data without any updates. \n", + "\n", + "Least Recently Used (LRU) can be a reasonable eviction policy for the our cache.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sample response:\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 9. Throttling Response\n", + "We can return a 429 status code: Too Many Requests whenever the user exceeds the rate limit.\n", + "\n", + "| Header Name | Description |\n", + "| :------------------- | :---------------------------------------------------------------------------- |\n", + "| RateLimit-Exceeded | The specific limit that has been exceeded. |\n", + "| Retry-After | The number of seconds that the client should wait before retrying a request. 
|\n", + "\n", + "\n", + "```\n", + "HTTP/1.1 429 Too Many Requests\n", + "Transfer-Encoding: chunked\n", + "Retry-After: 5\n", + "request-id: ff4751b7-d289-01a0-a6dd-9b3541c077fe\n", + "RateLimit-Exceeded: 60\n", + "Cache-Control: private\n", + "Content-Type: application/json;charset=utf-8\n", + "```\n", + "```json\r\n", + "{\r\n", + " \"error\": {\r\n", + " \"code\": \"ClientThrottled\",\r\n", + " \"message\": \"Client application is over its resource limit.\"\r\n", + " }\r\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/executed_designing_instagram.ipynb b/executed_designing_instagram.ipynb new file mode 100644 index 0000000..46db8fc --- /dev/null +++ b/executed_designing_instagram.ipynb @@ -0,0 +1,330 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Designing Instagram\n", + "\n", + "Let's design a photo-sharing service like IG, where users upload photos to share them with other users.\n", + "\n", + "Instagram enables its users to upload and share their photos and videos with other users. Users can choose to share information publicly or privately. Anything shared publicly can be seen by any other user, whereas privately shared content can only be accessed by a specified set of people.\n", + "\n", + "We plan to design a simpler version of Instagram, where a user can share photos and can also follow other users. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Requirements and Goals of the System\n", + "\n", + "#### Functional requirements\n", + "1. Users should be able to upload/download/view photos\n", + "2. Users can perform searches baed on photo/video titles\n", + "3. Users can follow other users\n", + "4. The system should generate Newsfeed consisting top photos from all the people the user follows\n", + "\n", + "#### Non-functional requirements\n", + "1. The service needs to be highly available\n", + "2. The acceptable latency is 200ms for News Feed generation\n", + "3. The system should be highly reliable; any uploaded photo/video should never be lost." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Capacity Estimation and Constraints\n", + "\n", + "The system would be read-heavy, so we'll focus on buiding a system that can retrieve photos quickly.\n", + "\n", + "- Assume we have 500M total users, with 1M daily active users.\n", + "- 2M new photos every day, 23 new photos per second.\n", + "- Average photo file size ~= 200KB\n", + "- Total space required for a 1 day of photos => \n", + " ```\n", + " 2M * 200KB => 400GB\n", + " ```\n", + "- Total space for 10 years:\n", + " ```\n", + " 400GB * 365 days * 10 years ~= 1425 TB => 1.4 Petabytes\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
High Level System Design\n", + "\n", + "At a high-level, we need to support two scenarios: uploading photos and view/searching photos.\n", + "\n", + "We need object storage servers to store photos and also some DB servers to store metadata information about the photos.\n", + "\n", + "![](images/instagram_high_level_design.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Database Schema\n", + "\n", + "> The DB schema will help understand data flow among various components and later guide us towards data partitioning.\n", + "\n", + "We need to store user data, their photos, and people they follow.\n", + "\n", + "Photo table \n", + "\n", + ">\n", + "\n", + "| Photo | \n", + "| --- | \n", + "| PhotoID: int (PK) | \n", + "| UserID: int |\n", + "| PhotoLatitude: int |\n", + "| PhotoLongitude: int |\n", + "| UserLatitude: int |\n", + "| UserLongitude: int |\n", + "| CreationDate: datetime |\n", + "\n", + ">\n", + "\n", + "| User |\n", + "| --- |\n", + "| UserID: int (PK) |\n", + "| Name: varchar(20) |\n", + "| DOB: datetime |\n", + "| CreatedAt: datetime |\n", + "| LastLogin: datetime |\n", + "\n", + ">\n", + "\n", + "|UserFollow | |\n", + "|---|---|\n", + "| PK | UserID1: int |\n", + "| PK | UserID2: int|\n", + "\n", + "\n", + "We could use an RDBMS like MySQL since we require joins. But relational DB come with their challenges, especially when scaling. So we can store the schema in a distributed wide-column NoSQL datastore like [Cassandra](https://en.wikipedia.org/wiki/Apache_Cassandra).\n", + "All the photo metadata can go to a table where the `'key'` is the `PhotoID` and the `'value'` would be an object containing Photo related details.\n", + "Cassandra and most key-value stores maintain a certain number of replicas to offer reliability. Also in these data stores, deletes don't get applied instantly, data is retained for a few days to support undeleting, before getting removed permanently.\n", + "\n", + "We can store the actual photos in as distributed file storage system like [Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) or [S3](https://en.wikipedia.org/wiki/Amazon_S3).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Data Size Estimation\n", + "\n", + "Let's estimate how much storage we'll need for the next 10 years.\n", + "\n", + "### User\n", + "\n", + "Assuming each int and datetime is 4 bytes, each row in User table will have:\n", + "```\n", + "UserID(4 bytes) + Name(20 bytes) + Email(32 bytes) + DOB(4 bytes) + \n", + "CreatedAt(4 bytes) + LastLogin(4 bytes) = 68 bytes\n", + "```\n", + "We have 500 million users:\n", + "```\n", + "500 million * 68 bytes ~= 32 GB\n", + "```\n", + "\n", + "### Photo\n", + "\n", + "Each row in Photos table will have:\n", + "```\n", + "PhotoID (4 bytes) + UserID (4 bytes) + PhotoPath (256 bytes) + PhotoLatitude (4 bytes) + PhotoLongitude(4 bytes) + UserLatitude (4 bytes) + UserLongitude (4 bytes) + CreationAt (4 bytes) = 284 bytes\n", + "```\n", + "We get 2M photos every day, so for one day we need:\n", + "```\n", + "2 M * 284 bytes ~= 0.5 GB per day\n", + "\n", + "For 10 years we'll need:\n", + "0.5GB per day * 365 days * 10 years => 1.88 TB\n", + "```\n", + "\n", + "### UserFollow\n", + "\n", + "Each row will have 8 bytes. 
Assume on average, each user follows 500 users, We would need 1.82 TB of storage for the UserFollow Table:\n", + "```\n", + "8 bytes * 500 followers * 500M users ~= 1.82 TB\n", + "```\n", + "Total space required for the DB tables for 10 years will be:\n", + "```\n", + "32 GB + 1.88 + 1.82 ~= 3.7TB\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Component Design\n", + "\n", + "Photo uploads (or writes) can be slow as they have to go to the disk, while reads will be faster, especially if they are being served from cache.\n", + "\n", + "Uploading users can consume all available connections, as uploading is a slow process, meaning reads can't be served if the system gets busy with all the write requests. \n", + "\n", + "We should keep in mind that all web servers have a connection limit. If we assume that a web server can have a maximum of 500 connections at any time, then it cant have more than 500 concurrent reads and uploads. To handle this bottleneck, we can split reads and writes into seperate services –– dedicated servers for reads and different servers for writes/uploads to ensure they don't hog the system.\n", + "\n", + "> Also, separating reads from writes will allow us to scale and optimize each operation independently.\n", + "\n", + "![](images/ig_read_writes.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Reliability and Redundancy\n", + "\n", + "Losing files is not an option for our service. \n", + "\n", + "We'll store multiple copies of each file so that if one storage server dies, we can retrieve another copy on a different storage server.\n", + "\n", + "This principle also applies to the rest of the system. If we want high availability, we need to have multiple replicas of services running, so that if a few services go down, the system remains available and running. \n", + "\n", + "> Redundancy removes the single point of failure in the system, taking control after a failover.\n", + "\n", + "If there are two instances of the same service running on production and one fails, the system can failover to the healthy copy. Failover can happen automatically or be done manually.\n", + "\n", + "![](images/ig_redundancy.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Data Sharding\n", + "\n", + "### a. Partitioning based on UserID\n", + "\n", + "We can shard based on UserID, so that we keep all photos of a user on the same shard. If one DB shard is 1TB, we need 4 shards to store 3.7TB of data. Assume for better performance and scalability, we keep 10 shards.\n", + "\n", + "We'll find the shard number by doing (UserID % 10) and storing the data there. To uniquely identify each photo in the system, we can append shard number to each PhotoID.\n", + "\n", + "**How do we generate PhotoIDs?** Each DB shard can have its own auto-increment sequence for PhotoIDs and since we will append ShardID with each PhotoID, it will make it unique throughout the system.\n", + "\n", + "Issues with this approach:\n", + "- How would we handle hot users? IG celebrities have a lot of followers, meaning many people see any photo they upload.\n", + "- Some users will have more photos than others, so data will be unevenly distrubuted in the partitions.\n", + "- Storying all photos of a user on one shard can cause issues like unavailability if that shard is down, or higher latency if it's serving high loads.\n", + "\n", + "\n", + "### b. 
Partitioning based on PhotoID\n", + "\n", + "If we generate unique PhotoIDs first, then find a shard number using\n", + "(PhotoID % 10), the above problems will be solved.\n", + "We wont need to append ShardID with PhotoID since the PhotoID will itself be unique throughout the system.\n", + "\n", + "**How to generate photoIDs?** We can dedicate a seperate DB instance to generate auto-incrementing IDs. If our PhotoID can fit into 64 bits, we can define a table containing only 64 bit ID field. So whenever we want to add a photo, we can insert a new row in Photo table and take that ID to be the new photo's PhotoID.\n", + "\n", + "**Wouldnt this key generating DB be a single point of failure?**\n", + "Yes. \n", + "\n", + "A workaround for this is to define two such DBs:\n", + "- one generates even numbered IDs\n", + "- the other generates odd numbered IDs\n", + "\n", + "```\n", + "KeyGeneratingServer1:\n", + "auto-increment-increment = 2\n", + "auto-increment-offset = 1\n", + " \n", + "KeyGeneratingServer2:\n", + "auto-increment-increment = 2\n", + "auto-increment-offset = 2\n", + "```\n", + "Then, we can put a load balancer in front of both DBs to round robin between them and to deal with downtime.\n", + "\n", + "**Alternatively**, we can have a standalone Key Generation Service (KGS) that generates random six letter strings beforehand and stores them in a database (let’s call it key-DB)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Ranking and NewsFeed Generation\n", + "\n", + "We need to fetch the latest, most popular photos of the people the user follows.\n", + "\n", + "- First, get a list of people the user follows and fetch metadata info of latest 100 photos for each\n", + "- Submit all photos to ranking algorithm which will determine the top 100 photos (based on recency, likeness, etc.)\n", + "- Return them to the user as news feed.\n", + "\n", + "To improve the efficiency, we can pre-generate the News Feed and store it in a separate table.\n", + "\n", + "#### **Pre-generating the News Feed**:\n", + "\n", + "- Dedicate servers that continously generate users' News feeds and store them in a **`UserNewsFeed`** table. When any user needs the latest photos, we simply query this table.\n", + "- When servers need to generate again, they will first query `UserNewsFeed` table to find last time the News feed was generated. Then, new News Feed data will be generated from that time onwards.\n", + "\n", + "#### **How do we send News Feed content to users?**\n", + "\n", + "**1. Pull**: Clients pull content from server on a regular/ or manually.\n", + "Problems:\n", + "- New data not showing until client issues a pull request\n", + "- Most of the time pull requests will result in empty response if there's no data. (Frustrating the user)\n", + "\n", + "**2. Push:** Servers push new data to users as soon as it is available. Users maintain a long poll request with the server. A possible problem is, a user who follows a lot of people or a celebrity user who has millions of followers; the server will have to push lots of updates quite frequently, straining the server.\n", + "\n", + "**3. Hybrid:** \n", + "- We can adopt a hybrid of the two approaches. Users with lots of followers will use a pull-based model. We only push data to those users who have < 1000 followers. \n", + "- Server pushes updates to all users not more than a certain frequency, and letting users with a lot of updates to regularly pull data. 
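+
+One way to read the hybrid rule above is sketched below in Python. It is only an illustration: every helper passed in (`followers_of`, `follower_count`, `push`, `mark_for_pull`) is a hypothetical injection point rather than part of the design, and the 1,000-follower cut-off is the threshold assumed in the text.
+
+```python
+CELEBRITY_THRESHOLD = 1_000   # assumed cut-off: push only for uploaders with fewer than 1000 followers
+
+
+def deliver_photo(uploader_id, photo_id, followers_of, follower_count, push, mark_for_pull):
+    """Hybrid fan-out sketch: push to followers of ordinary users, let followers of celebrities pull."""
+    if follower_count(uploader_id) < CELEBRITY_THRESHOLD:
+        for follower_id in followers_of(uploader_id):
+            push(follower_id, photo_id)        # e.g. append to that follower's pre-generated UserNewsFeed
+    else:
+        mark_for_pull(uploader_id, photo_id)   # followers pick it up on their next timeline pull
+```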
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 10. Cache and Load balancing\n", + "\n", + "Our service will need a massive-scale photo delivery system to serve the globally distributed users.\n", + "\n", + "We'll push content closer to the user using a large number of georgraphically distributed CDNs.\n", + "\n", + "We can also have a cache for metadata servers to cache hot DB rows. Memcache can cache the data and application servers before hitting the actual DB. \n", + "For cache eviction, we can use Least Recently User (LRU), where we discard the least recently viewed rows out of the cache memory.\n", + "\n", + "\n", + "#### **How can we build a more intelligent cache?** \n", + "\n", + "If we go with 80-20 rule, 20% of photo reads generates 80% of traffic. This means that certain photos are so popular that the majority of people view/search them. Therefore, we can try caching 20% of daily read volume of photos and metadata. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/executed_designing_twitter.ipynb b/executed_designing_twitter.ipynb new file mode 100644 index 0000000..980641d --- /dev/null +++ b/executed_designing_twitter.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Designing Twitter\n", + "\n", + "Twitter is an online social networking service where users post and read short 140-character messages called \"tweets\". Registered Users can post and read tweets, but those not registered can only read them. \n", + "\n", + "\n", + "## 1. Requirements and System Goals\n", + "\n", + "### Functional Requirements\n", + "1. Users should be able to post new tweets.\n", + "2. A user should be able to follow other users.\n", + "3. Users should be able to mark tweets as favorites.\n", + "4. Tweets can contain photos and videos.\n", + "5. A user should have a timeline consting of top tweets from all the people the user follows.\n", + "\n", + "### Non-functional Requirements\n", + "1. Our service needs to be highly available.\n", + "2. Acceptance latency of the sytstem is 200ms for timeline generation.\n", + "3. Consistency can take a hit (in the interest of availability); if user doesn't see a tweet for a while, it should be fine.\n", + "\n", + "### Extended Requirements\n", + "1. Searching tweets.\n", + "2. Replying to a tweet.\n", + "3. Trending topics - current hot topics.\n", + "4. Tagging other users.\n", + "5. Tweet notification.\n", + "6. Suggestions on who to follow." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Capacity Estimation and Constraints\n", + "\n", + "Let's assume we have 1 billion users, with 200 million daily active users (DAU). 
\n", + "Also assume we have 100 million new tweets every day, and on average each user follows 200 people.\n", + "\n", + "**How many favorites per day?** If on average, each user favorites 5 tweets per day, we have:\n", + "```\n", + "200M users * 5 => 1 billion favorites.\n", + "```\n", + "\n", + "**How many total tweet-views?** Let's assume on average a user visits their timeline twice a day and visits 5 other people's pages. On each page if a user sees 20 tweets, then the no. of views our system will generate is:\n", + "```\n", + "200M DAU * ((2 + 5) * 20 tweets) => 28B/day\n", + "```\n", + "\n", + "#### Storage Estimates\n", + "Let's say each tweet has 140 characters and we need two bytes to store a character without compression. Assume we need 30 bytes to store metadata with each tweet (like ID, timestamps, etc.). Total storage we would need is:\n", + "```\n", + "100M new daily tweets * ((140 * 2) + 30) bytes => ~28 GB/day\n", + "```\n", + "\n", + "Not all tweets will have media, let's assume that on average every fifth tweet has a photo and every tenth a video. Let's also assume on average, a photo = 0.5MB and a video = 5MB. This will lead us to have:\n", + "```\n", + " (100M/5 photos * 0.5MB) + (100M/10 videos * 5MB) ~= 60 TB/day\n", + "```\n", + "\n", + "#### Bandwidth Estimates\n", + "Since total ingress is 60TB per day, then it will translate to: \n", + "```\n", + "60TB / (24 * 60 * 60) ~= 690 MB/sec\n", + "```\n", + "Remember we have 28 billion tweets views in a day. We must show the photo of every tweet, but let's assume that the users watch every 3rd video they see in their timeline. So, total egress will be:\n", + "\n", + "```\n", + " (28Billion * 280 bytes) / 86400 of text ==> 93MB/s\n", + " + (28Billion/5 * 0.5MB) / 86400 of photos ==> ~32GB/s\n", + " + (28Billion/10/3 * 5MB) / 86400 of videos ==> ~54GB/s\n", + " \n", + " Total ~= 85GB/sec\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. System APIs\n", + "\n", + "We can have a REST API to expose the functionality of our service. \n", + "\n", + "```python\n", + "tweet(\n", + " api_dev_key, # (string): The API developer key. Use to throttle users based on their allocated quota.\n", + " tweet_data, # (string): The text of the tweet, typically up to 140 characters.\n", + " tweet_location, # (string): Optional location (lat, long) this Tweet refers to.\n", + " user_location, # (string): optional user's location.\n", + " media_ids, # (optional list of media_ids to associated with the tweet. (all media - photo, videos needs to be uploaded separately)\n", + ")\n", + "```\n", + "Returns: (string)\n", + " A successful post will return the URL to access that tweet. Otherwise, return an appropriate HTTP error.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. High Level System Design\n", + "We need a system that can efficiently store all the new tweets, \n", + "i.e\n", + "- store `100M/86400sec => 1150 tweets per second` \n", + "- and read `28billion/86400s => 325,000 tweets per second`.\n", + "\n", + "It's clears that from the requirements, the system will be **read-heavy**.\n", + "\n", + "At a high level:\n", + "- we need multiple application servers to serve all these requests with load balancers in front of them for traffic distribution. \n", + "- On the backend, we need an efficent datastore that will store all the new tweets and can support huge read numbers. 
\n", + "- We also need file storage to store photos and videos.\n", + "\n", + "This traffic will be distributed unevenly throughout the day, though, at peak time we expect at leas a few thousand write requests and around 1M read requests per second. \n", + "**We should keep this in mind while designing the architecture of our system.**\n", + "\n", + "\n", + "![](images/twitter_high_level.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Database Schema\n", + "We need to store data about users, their tweets, their favorite tweets, and people they follow.\n", + "\n", + "![](images/twitter_db_schema.svg)\n", + "\n", + "For choosing between SQL or NoSQL, check out Designing Instagram from the README." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Data Sharding\n", + "\n", + "We have a huge number of tweets every day. We need to distribute our data onto multiple machines such that reads/writes are efficient.\n", + "\n", + "\n", + "#### Sharding based on UserID\n", + "We can try storing a user's data on one server. While storing:\n", + "- Pass a UserID to our hash function that will map the user to a DB server where we'll store all of their data. (tweets, favorites, follows, etc.)\n", + "- While querying for their data, we can ask the hash function where to find it and read it from there. \n", + "\n", + "Issues:\n", + "1. What if a user becomes hot? There will be lots of queries on the server holding that user. This high load will affect the service's performance.\n", + "2. Over time, some users will have more data compared to others. Maintaining a uniform distribution of growing data is quite difficult. \n", + "\n", + "#### Sharding based on TweetID\n", + "- Hash function maps each TweetID to a random server where we store that tweet.\n", + "- Searching a tweet will query all servers, and each server will return a set of tweets.\n", + "- A centralized server will aggregate the results to return them to the user.\n", + "\n", + "**To generate a user's timeline:**\n", + "1. App server will find all the people the user follows.\n", + "2. App server will send a query to all DB servers to find tweets from these people.\n", + "3. Each DB server will find tweets for each user, sort them by recency and return top tweets.\n", + "4. App server will merge all results and sort them again to return the top results to the user.\n", + "\n", + "This solves the problem of hot users.\n", + "\n", + "Issues with this approach:\n", + "- We have to query all DB partitions to find tweets for a user, leading to higher latencies.\n", + "\n", + "> We can improve the performance by caching hot tweets in front of the DB servers.\n", + "\n", + "#### Sharding based on Tweet creation time\n", + "\n", + "Storing tweets based on creation timestamp will help us to fetch all top tweets quickly and we only have to query a very small set of database servers.\n", + "\n", + "Issues:\n", + "- Traffic load won't be distributed. e.g when writing, all new tweets will be going to one DB server, and remaining DB servers sit idle. When reading, a database server holding the latest data will have high load as compared to servers holding old data.\n", + "\n", + "#### Sharding by TweetID + Tweet creation time\n", + "Each tweetID should be universally unique and contain a timestamp.\n", + "We can use epoch time for this. \n", + "\n", + "First part of TweetID will be a timestamp, second part an auto-incrementing number. 
We can then figure out the shard number from this TweetId and store it there.\n", + "\n", + "What could be the size of our TweetID?\n", + "If our epoch time started today, the number of bits we need to store the number of seconds for the next 50 years:\n", + "\n", + "```\n", + "Number of seconds for the next 50 years:\n", + " 86400 sec/day * 365 days * 50 years ==> 1.6 billion seconds.\n", + "\n", + "```\n", + "\n", + "![](images/twitter_epoch.png)\n", + "\n", + "We would need 31 bits to store the epoch time.\n", + "```\n", + "2 ^ 31 ==> ~ 2 billion seconds\n", + "```\n", + "\n", + "Since on average we expect 1150 new tweets every second i.e \n", + " ```\n", + " (100M daily / 86400 seconds) ==> 1150 tweets/sec\n", + " ```\n", + "we can allocate 17 bits to store auto incremented sequence. This makes our tweetID 48 bits long. \n", + "\n", + "Every second, we can store 2^17(130k) new tweets. \n", + "We can reset our auto incrementing sequence every second. For fault tolerance and better performance, we can have two DB servers to generate auto-incrementing keys; one odd numbers and one even numbered keys.\n", + "\n", + "If we assume our current epoch seconds begins now, TweetIDs will look like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2025-05-21T08:11:28.586376Z", + "iopub.status.busy": "2025-05-21T08:11:28.585222Z", + "iopub.status.idle": "2025-05-21T08:11:28.596309Z", + "shell.execute_reply": "2025-05-21T08:11:28.595286Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch - autoincrement\n", + "1571691220-000001\n", + "1571691220-000002\n", + "1571691220-000003\n", + "1571691220-000004\n" + ] + } + ], + "source": [ + "epoch = 1571691220\n", + "print('epoch - autoincrement')\n", + "for i in range(1,5):\n", + " print(f'{epoch}-{i:06}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we make our TweetID 64bits (8 bytes) long, we can easily store tweets for the next 100 years. \n", + "\n", + "In the approach above, we still have to query all servers for timeline generation, but the reads/writes will be substancially quicker.\n", + "\n", + "- Since we don't have any secondary index (on creation time) this will reduce write latency.\n", + "- While reading, we don't need to filter on creatiion-time as our primary key has epoch time included in it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Caching\n", + "\n", + "We can have a cache for DB servers to cache hot tweets and users.\n", + "Memcache can be used here to store the whole tweet objects.\n", + "\n", + "> Application servers before hitting the DB, can quickly check if the cache has desired tweets.\n", + "\n", + "#### Cache replacement policy\n", + "When cache is full, we want to replace a tweet with a newer/hotter tweet. Least Recently Used(LRU) policy can be used to discard the least recently viewed tweet first.\n", + "\n", + "#### Intelligent caching?\n", + "If we go with the 80-20 rule, 20% of tweets generating 80% of read traffic, meaning certain tweets are so popular that a majority of people read them. Therefore, we can try to cache 20% of daily read volume from each shard.\n", + "\n", + "#### Caching latest data?\n", + "Let's say if 80% of our users see tweets from the last 3 days only, we can cache all tweets from past 3 days. \n", + "\n", + "We are getting 100M tweets or 30GB of new data every day(without photos or videos). 
If we want to store all the tweets from last 3 days, we will need 100GB of memory. This data can easily fit into one cache server, but we should replicate it onto multiple servers to distribute all the read traffic to reduce the load on cache servers. \n", + "\n", + "Whenever we are generating a user's timeline, we can ask the cache servers if they have all the recent tweets for that user, If yes, we can simply return all the data from the cache. If we don't have enough, we have to query backend server to fetch the data. We can also cache photos/videos from the last 3 days.\n", + "\n", + "### Cache structure\n", + "Our cache would be a hash table. \n", + "- Key would be `OwnerID` and value would be a doubly linked list containing all tweets from that user in the past 3 days.\n", + "- Since we retrieve the most recent tweets first, we can insert new tweets at the head of linked list.\n", + "- Older tweets will be at the tail of the linked list. \n", + "- We can remove tweets from the tail to make space for newer tweets\n", + "\n", + "![](images/twitter_component_design.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Load Balancing\n", + "We can add Load balancing layer at 3 places:\n", + "1. Between clients and application servers\n", + "2. Between app servers and DB replication servers\n", + "3. Between Aggregation servers and Cache servers\n", + "\n", + "We can adopt a simple round robin approach to distribute incoming requests equally among servers.\n", + "Benefits of this LB approach:\n", + "- Simple to implement with no overhead\n", + "- If a server is dead, LB will take it out from the rotation and will stop sending any traffic to it.\n", + "\n", + "> Problem with Round Robin is that it doesn't know if a server is overloaded with requests or if it's slow. It won't stop sending requests to that server. To fix this, the LB can periodically query the backend server about its load and adjusts traffic to it based on that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Extended Requirements\n", + "\n", + "**Trending Topics:** We can cache most frequently occuring hashtags or search queries in the last N seconds and update them after every M seconds. We can rank trending topics based on like frequency, number of retweets, etc.\n", + "\n", + "**Suggestions on who to follow:** We can suggest friends of people someone follows. Wecan go two or three degrees of separation down to find famous people for the suggestions. Give preference to people with more followers.\n", + "\n", + "We can use Machine Learning to offer suggestions based on reoccuring patterns like, common followers if the person is following this user, common location or interests, etc.\n", + "\n", + "**How do we serve feeds:** We can pre-generate the feed to improve efficiency; for details see [Ranking and time generation under Designing Instagram](designing_instagram.md)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Monitoring\n", + "We should constantly collect data to get an insight into how our system is doing. We can collect:\n", + "- New tweets per day/second. \n", + "- What time the daily peak is.\n", + "- Timeline delivery stats, how many tweets per second our service is delivering.\n", + "- Average latency that is seen by the user to refresh their timeline.\n", + "\n", + "By monitoring these, we will realize if we need more replication, load balancing or caching." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 11. Availability\n", + "Since our system is read-heavy, we can have multiple secondary database servers for each DB partition. Seconday servers will be used for read traffic only. All writes will first go to the primary server and then will be replicated to secondary servers. This also implements fault tolerance, since whenever the primary server goes down, we can failover to a secondary server." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 7b5ff3b..173eff6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,17 @@ -asn1crypto==0.24.0 -certifi==2024.7.4 -cffi==1.12.2 -chardet==3.0.4 -cryptography==42.0.4 -cycler==0.10.0 -idna==3.7 -kiwisolver==1.1.0 -libarchive-c==2.8 -pycosat==0.6.3 -pycparser==2.19 -pyOpenSSL==19.0.0 -pyparsing==2.4.0 -PySocks==1.6.8 -python-dateutil==2.8.0 -ruamel-yaml==0.15.46 -six==1.12.0 -torch==1.1.0 -tqdm==4.66.3 -urllib3==1.26.19 -pytest==5.3.1 +# Core testing +pytest +nbval + +# Common utilities & jupyter/nbval underlying dependencies +six +python-dateutil +ipykernel # For executing notebooks +nbformat # For reading/writing notebook structure +jupyter_core # Common core utilities for Jupyter +# Network/Security - often pulled in by requests/urllib3 or similar +certifi +idna +urllib3 +# cryptography might be needed by pyOpenSSL or other security aspects of web requests +cryptography +pyOpenSSL