From 7e73d62f1fa8de931e5984f44b00b9effcb4ed1e Mon Sep 17 00:00:00 2001 From: eyrei123 <88923476+eyrei123@users.noreply.github.com> Date: Tue, 14 Nov 2023 20:22:05 +0000 Subject: [PATCH 1/5] Add files via upload --- Solution.ipynb | 567 +++++++++++++++++++++++++++++++++++ james_bond_data.csv | 28 ++ james_bond_data.json | 1 + james_bond_data.parquet | Bin 0 -> 11050 bytes james_bond_data.xlsx | Bin 0 -> 11269 bytes james_bond_data_cleansed.csv | 26 ++ 6 files changed, 622 insertions(+) create mode 100644 Solution.ipynb create mode 100644 james_bond_data.csv create mode 100644 james_bond_data.json create mode 100644 james_bond_data.parquet create mode 100644 james_bond_data.xlsx create mode 100644 james_bond_data_cleansed.csv diff --git a/Solution.ipynb b/Solution.ipynb new file mode 100644 index 0000000000..356b6682e7 --- /dev/null +++ b/Solution.ipynb @@ -0,0 +1,567 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", + "metadata": {}, + "source": [ + "## Reading Data From CSV Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "! python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1148ca58-9a4d-42ed-a43a-e3b8b3359c2d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_df = pd.read_csv(\"james_bond_data.csv\")\n", + "james_bond_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e47c1f9b-b390-4035-956b-622615b57f32", + "metadata": {}, + "source": [ + "## Reading Data From Other Sources" + ] + }, + { + "cell_type": "markdown", + "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", + "metadata": {}, + "source": [ + "### Reading Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0364b81-64a0-4098-89fc-e58bd6d68257", + "metadata": {}, + "outputs": [], + "source": [ + "! python -m pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", + "metadata": {}, + "outputs": [], + "source": [ + "import openpyxl\n", + "import pandas as pd\n", + "\n", + "james_bond_df_excel = pd.read_excel(\"james_bond_data.xlsx\")\n", + "james_bond_df_excel.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", + "metadata": {}, + "source": [ + "### Reading JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7465cd11-dad4-4741-9372-f825b28c33d6", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "james_bond_df_json = pd.read_json(\"james_bond_data.json\")\n", + "james_bond_df_json.head()" + ] + }, + { + "cell_type": "markdown", + "id": "69f884c2-92e8-4db3-bd63-84007f654808", + "metadata": {}, + "source": [ + "### Scraping HTML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902722d-9648-4124-80b0-64004342170d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df_html = pd.read_html(\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + ")\n", + "james_bond_df_html[1].head()" + ] + }, + { + "cell_type": "markdown", + "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", + "metadata": {}, + "source": [ + "### Reading Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd6f496d-aa94-43ce-9e97-6f01108df47b", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df_parquet = pd.read_parquet(\"james_bond_data.parquet\")\n", + "james_bond_df_parquet.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e432b28e-257b-422b-b2f8-06f41608391b", + "metadata": {}, + "source": [ + "## Dealing With Missing Data and Invalid Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38eb1abb-9f89-4a53-9e77-f7c71dbeff18", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[james_bond_df.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9201a-11c1-4cdd-9625-d70cee736191", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df.at[10, \"Avg_User_IMDB\"] = 7.1\n", + "james_bond_df.at[10, \"Avg_User_Rtn_Tom\"] = 6.8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "001996e3-2fce-4228-a873-b78eef613bba", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[[\"US_Gross\", \"World_Gross\", \"Budget ($ 000s)\", \"Film_Length\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"US_Gross\"] = (\n", + " james_bond_df[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True)\n", + ").astype(float)\n", + "\n", + "james_bond_df[\"World_Gross\"] = (\n", + " james_bond_df[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True)\n", + ").astype(float)\n", + "\n", + "james_bond_df[\"Budget ($ 000s)\"] = (\n", + " james_bond_df[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True)\n", + ").astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Film_Length\"] = (\n", + " james_bond_df[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Release\"] = pd.to_datetime(james_bond_df[\"Release\"], format=\"%B, %Y\")\n", + "james_bond_df[\"Release_Year\"] = james_bond_df[\"Release\"].dt.year" + ] + }, + { + "cell_type": "markdown", + "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", + "metadata": {}, + "source": [ + "## Dealing With Inconsistencies in Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc483320-7895-4368-a672-b98f8d0c9755", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Budget ($ 000s)\"] = james_bond_df[\"Budget ($ 000s)\"] * 1000\n", + "james_bond_df.rename(columns={\"Budget ($ 000s)\": \"Budget\"}, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", + "metadata": {}, + "source": [ + "## Removing Duplicate Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Movie\"].value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20067efb-e7c7-4690-b483-1d29847ad24f", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", + "james_bond_df[james_bond_df[\"Movie\"].isin(duplicate_movies)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df.drop_duplicates(inplace=True, ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", + "metadata": {}, + "source": [ + "## Removing Typos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Bond\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Bond\"] = james_bond_df[\"Bond\"].str.replace(\"Shawn\", \"Sean\")\n", + "james_bond_df[\"Bond\"] = james_bond_df[\"Bond\"].str.replace(\"MOORE\", \"Moore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25da4a99-6b90-4785-aaa4-48bed819e9be", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Bond\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26b138d-72e5-4e15-a875-ee65023545d1", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Bond_Car_MFG\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Bond_Car_MFG\"] = james_bond_df[\"Bond_Car_MFG\"].str.replace(\"Astin\", \"Aston\")" + ] + }, + { + "cell_type": "markdown", + "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", + "metadata": {}, + "source": [ + "## Checking for Invalid Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81338285-2067-46fc-82f8-360d92ec7153", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[[\"Film_Length\", \"Martinis\"]].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d84acfc8-fb2d-4c45-bfac-20469fc1de97", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Film_Length\"].replace(1200, 120, inplace=True)\n", + "james_bond_df[\"Martinis\"].replace(-6, 6, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "52db1351-36ed-4104-a999-345ebbc62214", + "metadata": {}, + "source": [ + "## Storing Your Cleansed Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575a774e-6913-41fb-8ff9-4d786f478007", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "## Using Python for Data Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn numpy" + ] + }, + { + "cell_type": "markdown", + "id": "0246dcb1-88fc-4a3e-acc1-571037390e09", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "#x = james_bond_df[\"Avg_User_IMDB\"].values.reshape(-1, 1)\n", + "#y = james_bond_df[\"Avg_User_Rtn_Tom\"].values.reshape(-1, 1)\n", + "\n", + "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", + "y = james_bond_df[\"Avg_User_Rtn_Tom\"].array.reshape(-1, 1)\n", + "\n", + "plt.title(\"Scatter Plot of Ratings.\")\n", + "plt.xlabel(\"Average IMDb Rating\")\n", + "plt.ylabel(\"Average Rotten Tomatoes Rating\")\n", + "plt.scatter(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", + "y = james_bond_df[\"Avg_User_Rtn_Tom\"].array.reshape(-1, 1)\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {round(model.score(x, y),2)}\"\n", + "best_fit_equation = f\"y={round(model.coef_[0][0], 4)}x{round(model.intercept_[0], 4)}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "plt.title(\"Scatter Plot of Ratings.\")\n", + "plt.xlabel(\"Average IMDb Rating\")\n", + "plt.ylabel(\"Average Rotten Tomatoes Rating\")\n", + "plt.scatter(x, y)\n", + "plt.text(7.25, 5.5, r_squared, fontsize=10)\n", + "plt.text(7.25, 7, best_fit_equation, fontsize=10)\n", + "plt.plot(x, y_pred, color=\"red\")" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": {}, + "outputs": [], + "source": [ + "film_length_groups = james_bond_df[\"Film_Length\"].value_counts(bins=7, sort=False)\n", + "film_length_groups.plot(kind=\"bar\", title=\"Film Length Distribution\").set(\n", + " xlabel=\"Time Range (mins)\", ylabel=\"Count\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_df[\"Film_Length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", + "y = james_bond_df[\"Kills_Bond\"].array.reshape(-1, 1)\n", + "\n", + "plt.title(\"Scatter Plot of Kills vs Ratings.\")\n", + "plt.xlabel(\"Average IMDb Rating\")\n", + "plt.ylabel(\"Kills By Bond\")\n", + "plt.scatter(x, y)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/james_bond_data.csv b/james_bond_data.csv new file mode 100644 index 0000000000..4a983b2201 --- /dev/null +++ b/james_bond_data.csv @@ -0,0 +1,28 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget ($ 000s),Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond +"June, 1962",Dr. No,Sean Connery,Sunbeam," $16,067,035.00 "," $59,567,035.00 "," $1,000.00 ",110 mins,7.3,7.7,2,4 +"August, 1963",From Russia with Love,Sean Connery,Bently," $24,800,000.00 "," $78,900,000.00 "," $2,000.00 ",115 mins,7.5,8,0,11 +"May, 1964",Goldfinger,Sean Connery,Aston Martin," $51,100,000.00 "," $124,900,000.00 "," $3,000.00 ",110 mins,7.8,8.4,1,9 +"September, 1965",Thunderball,Sean Connery,Aston Martin," $63,600,000.00 "," $141,200,000.00 "," $9,000.00 ",130 mins,7,6.8,0,20 +"November, 1967",You Only Live Twice,Sean Connery,Toyota," $43,100,000.00 "," $111,600,000.00 "," $9,500.00 ",117 mins,6.9,6.3,1,21 +"July, 1969",On Her Majesty's Secret Service,George Lazenby,Mercury," $22,800,000.00 "," $82,000,000.00 "," $8,000.00 ",142 mins,6.8,6.7,1,5 +"March, 1971",Diamonds Are Forever,Shawn Connery,Ford," $43,800,000.00 "," $116,000,000.00 "," $7,200.00 ",1200 mins,6.7,6.3,0,7 +"August, 1973",Live and Let Die,Roger Moore,AMC," $35,400,000.00 "," $161,800,000.00 "," $7,000.00 ",121 mins,6.8,5.9,0,8 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"April, 1977",The Spy Who Loved Me,Roger Moore,Lotus," $46,800,000.00 "," $185,400,000.00 "," $14,000.00 ",125 mins,,,1,31 +"October, 1979",Moonraker,Roger Moore,Lotus," $70,300,000.00 "," $210,300,000.00 "," $31,000.00 ",126 mins,6.2,5.7,1,12 +"June, 1981",For Your Eyes Only,Roger MOORE,Citroen," $54,800,000.00 "," $195,300,000.00 "," $28,000.00 ",127 mins,6.8,6.3,0,18 +"March, 1983",Octopussy,Roger Moore,Bajaj," $67,900,000.00 "," $187,500,000.00 "," $27,500.00 ",131 mins,6.5,5.3,0,15 +"October, 1985",A View to a Kill,Roger Moore,Rolls Royce," $50,327,960.00 "," $152,627,960.00 "," $30,000.00 ",131 mins,6.2,4.7,0,5 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"January, 1989",License to Kill,Timothy Dalton,Aston Martin," $34,667,015.00 "," $156,167,015.00 "," $42,000.00 ",133 mins,6.5,6,1,10 +"September, 1995",GoldenEye,Pierce Brosnan,BMW," $106,429,941.00 "," $356,429,941.00 "," $60,000.00 ",130 mins,7.2,6.9,1,47 +"July, 1997",Tomorrow Never Dies,Pierce Brosnan,Aston Martin," $125,304,276.00 "," $339,504,276.00 "," $110,000.00 ",119 mins,6.4,6,1,30 +"June, 1999",The World Is Not Enough,Pierce Brosnan,BMW," $126,930,660.00 "," $361,730,660.00 "," $135,000.00 ",128 mins,6.3,5.7,1,27 +"August, 2002",Die Another Day,Pierce Brosnan,Aston Martin," $160,942,139.00 "," $431,942,139.00 "," $142,000.00 ",133 mins,6,6.1,2,31 +"February, 2006",Casino Royale,Daniel Craig,Astin Martin," $167,365,000.00 "," $596,365,000.00 "," $102,000.00 ",144 mins,7.9,7.8,3,11 +"December, 2008",Quantum of Solace,Daniel Craig,Aston Martin," $169,368,427.00 "," $591,692,078.00 "," $230,000.00 ",106 mins,6.7,6.1,-6,16 +"November, 2012",Skyfall,Daniel Craig,Astin Martin," $304,360,277.00 "," $1,108,561,108.00 "," $200,000.00 ",143 mins,7.8,8.2,1,26 +"September, 2015",Spectre,Daniel Craig,Aston Martin," $200,074,175.00 "," $879,620,923.00 "," $245,000.00 ",148 mins,6.8,6.4,1,30 +"November, 2021",No Time to Die,Daniel Craig,Aston Martin," $160,891,007.00 "," $759,959,662.00 "," $275,000.00 ",163 mins,7.3,7.3,1,14 diff --git a/james_bond_data.json b/james_bond_data.json new file mode 100644 index 0000000000..852810b38e --- /dev/null +++ b/james_bond_data.json @@ -0,0 +1 @@ +{"Release":{"0":"June, 1962","1":"August, 1963","2":"May, 1964","3":"September, 1965","4":"November, 1967","5":"July, 1969","6":"March, 1971","7":"August, 1973","8":"July, 1974","9":"July, 1974","10":"April, 1977","11":"October, 1979","12":"June, 1981","13":"March, 1983","14":"October, 1985","15":"May, 1987","16":"May, 1987","17":"January, 1989","18":"September, 1995","19":"July, 1997","20":"June, 1999","21":"August, 2002","22":"February, 2006","23":"December, 2008","24":"November, 2012","25":"September, 2015","26":"November, 2021"},"Movie":{"0":"Dr. No","1":"From Russia with Love","2":"Goldfinger","3":"Thunderball","4":"You Only Live Twice","5":"On Her Majesty's Secret Service","6":"Diamonds Are Forever","7":"Live and Let Die","8":"The Man with the Golden Gun","9":"The Man with the Golden Gun","10":"The Spy Who Loved Me","11":"Moonraker","12":"For Your Eyes Only","13":"Octopussy","14":"A View to a Kill","15":"The Living Daylights","16":"The Living Daylights","17":"License to Kill","18":"GoldenEye","19":"Tomorrow Never Dies","20":"The World Is Not Enough","21":"Die Another Day","22":"Casino Royale","23":"Quantum of Solace","24":"Skyfall","25":"Spectre","26":"No Time to Die"},"Bond":{"0":"Sean Connery","1":"Sean Connery","2":"Sean Connery","3":"Sean Connery","4":"Sean Connery","5":"George Lazenby","6":"Shawn Connery","7":"Roger Moore","8":"Roger Moore","9":"Roger Moore","10":"Roger Moore","11":"Roger Moore","12":"Roger MOORE","13":"Roger Moore","14":"Roger Moore","15":"Timothy Dalton","16":"Timothy Dalton","17":"Timothy Dalton","18":"Pierce Brosnan","19":"Pierce Brosnan","20":"Pierce Brosnan","21":"Pierce Brosnan","22":"Daniel Craig","23":"Daniel Craig","24":"Daniel Craig","25":"Daniel Craig","26":"Daniel Craig"},"Bond_Car_MFG":{"0":"Sunbeam","1":"Bently","2":"Aston Martin","3":"Aston Martin","4":"Toyota","5":"Mercury","6":"Ford","7":"AMC","8":"AMC","9":"AMC","10":"Lotus","11":"Lotus","12":"Citroen","13":"Bajaj","14":"Rolls Royce","15":"Rolls Royce","16":"Rolls Royce","17":"Aston Martin","18":"BMW","19":"Aston Martin","20":"BMW","21":"Aston Martin","22":"Astin Martin","23":"Aston Martin","24":"Astin Martin","25":"Aston Martin","26":"Aston Martin"},"US_Gross":{"0":" $16,067,035.00 ","1":" $24,800,000.00 ","2":" $51,100,000.00 ","3":" $63,600,000.00 ","4":" $43,100,000.00 ","5":" $22,800,000.00 ","6":" $43,800,000.00 ","7":" $35,400,000.00 ","8":" $21,000,000.00 ","9":" $21,000,000.00 ","10":" $46,800,000.00 ","11":" $70,300,000.00 ","12":" $54,800,000.00 ","13":" $67,900,000.00 ","14":" $50,327,960.00 ","15":" $51,185,000.00 ","16":" $51,185,000.00 ","17":" $34,667,015.00 ","18":" $106,429,941.00 ","19":" $125,304,276.00 ","20":" $126,930,660.00 ","21":" $160,942,139.00 ","22":" $167,365,000.00 ","23":" $169,368,427.00 ","24":" $304,360,277.00 ","25":" $200,074,175.00 ","26":" $160,891,007.00 "},"World_Gross":{"0":" $59,567,035.00 ","1":" $78,900,000.00 ","2":" $124,900,000.00 ","3":" $141,200,000.00 ","4":" $111,600,000.00 ","5":" $82,000,000.00 ","6":" $116,000,000.00 ","7":" $161,800,000.00 ","8":" $97,600,000.00 ","9":" $97,600,000.00 ","10":" $185,400,000.00 ","11":" $210,300,000.00 ","12":" $195,300,000.00 ","13":" $187,500,000.00 ","14":" $152,627,960.00 ","15":" $191,200,000.00 ","16":" $191,200,000.00 ","17":" $156,167,015.00 ","18":" $356,429,941.00 ","19":" $339,504,276.00 ","20":" $361,730,660.00 ","21":" $431,942,139.00 ","22":" $596,365,000.00 ","23":" $591,692,078.00 ","24":" $1,108,561,108.00 ","25":" $879,620,923.00 ","26":" $759,959,662.00 "},"Budget ($ 000s)":{"0":" $1,000.00 ","1":" $2,000.00 ","2":" $3,000.00 ","3":" $9,000.00 ","4":" $9,500.00 ","5":" $8,000.00 ","6":" $7,200.00 ","7":" $7,000.00 ","8":" $7,000.00 ","9":" $7,000.00 ","10":" $14,000.00 ","11":" $31,000.00 ","12":" $28,000.00 ","13":" $27,500.00 ","14":" $30,000.00 ","15":" $40,000.00 ","16":" $40,000.00 ","17":" $42,000.00 ","18":" $60,000.00 ","19":" $110,000.00 ","20":" $135,000.00 ","21":" $142,000.00 ","22":" $102,000.00 ","23":" $230,000.00 ","24":" $200,000.00 ","25":" $245,000.00 ","26":" $275,000.00 "},"Film_Length":{"0":"110 mins","1":"115 mins","2":"110 mins","3":"130 mins","4":"117 mins","5":"142 mins","6":"1200 mins","7":"121 mins","8":"125 mins","9":"125 mins","10":"125 mins","11":"126 mins","12":"127 mins","13":"131 mins","14":"131 mins","15":"130 mins","16":"130 mins","17":"133 mins","18":"130 mins","19":"119 mins","20":"128 mins","21":"133 mins","22":"144 mins","23":"106 mins","24":"143 mins","25":"148 mins","26":"163 mins"},"Avg_User_IMDB":{"0":7.3,"1":7.5,"2":7.8,"3":7.0,"4":6.9,"5":6.8,"6":6.7,"7":6.8,"8":6.7,"9":6.7,"10":null,"11":6.2,"12":6.8,"13":6.5,"14":6.2,"15":6.7,"16":6.7,"17":6.5,"18":7.2,"19":6.4,"20":6.3,"21":6.0,"22":7.9,"23":6.7,"24":7.8,"25":6.8,"26":7.3},"Avg_User_Rtn_Tom":{"0":7.7,"1":8.0,"2":8.4,"3":6.8,"4":6.3,"5":6.7,"6":6.3,"7":5.9,"8":5.1,"9":5.1,"10":null,"11":5.7,"12":6.3,"13":5.3,"14":4.7,"15":6.3,"16":6.3,"17":6.0,"18":6.9,"19":6.0,"20":5.7,"21":6.1,"22":7.8,"23":6.1,"24":8.2,"25":6.4,"26":7.3},"Martinis":{"0":2,"1":0,"2":1,"3":0,"4":1,"5":1,"6":0,"7":0,"8":0,"9":0,"10":1,"11":1,"12":0,"13":0,"14":0,"15":2,"16":2,"17":1,"18":1,"19":1,"20":1,"21":2,"22":3,"23":-6,"24":1,"25":1,"26":1},"Kills_Bond":{"0":4,"1":11,"2":9,"3":20,"4":21,"5":5,"6":7,"7":8,"8":1,"9":1,"10":31,"11":12,"12":18,"13":15,"14":5,"15":13,"16":13,"17":10,"18":47,"19":30,"20":27,"21":31,"22":11,"23":16,"24":26,"25":30,"26":14}} \ No newline at end of file diff --git a/james_bond_data.parquet b/james_bond_data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..88bd22b4fb36adc606eaf6eacd5b46d56121e5d1 GIT binary patch literal 11050 zcmcgyZEPDydY+Y*HffWwBx@DYk?cK1W)x>7?FYZsa9k}Zks?D%7A29CF0HU6R}`h? zE{`9Q=rl!8^om?AIQRnR;5!tDG{}#jz4S+s25s6O*Zw#J2M6en^iZHd5geLJa43qP zD1stro4zxoBub?6f!HQIyEE_1ydTdy^Ugaf-%yeQLo=_QVt(5jXG8`KAoRoDr_UgS zVs%5~SV8uQc4#T}LcLnE^bJckXeHE!#o_UsgX)r2u4$#LRiYTDcUdi))DQOCMV$Op(gs2a;2zSCRo1%rY+=ZrlUmrW!t3f&*^|b zO`=5qx9Na%brR75@5tYxfF(W{Q;oV>vHX}0$icgamc6!h+uKNXykjfTvOM`W$S(4{ zNd31xtYs^87VT1QrRUsWh*iUNa}0CSCFd`us6ia9mx zL*wTTOygJ>>2;nZ3S&(Ce$aSW?}4X}ve?+?l^0~|<-f{Q|Ksn+AA94|EYRI=T80_B z%ZpC+xtynyny#r;jp-Tk4SjHW=w}x9OSIeikJ-<;j3R>kXw3avPMqaCA zRb4+1%~i9`E*N@)jTd(`c6qm$(=Gvd!C>dK3Y$>3v}&#KSe0GUauuxxugVS#8H6D- zMYUua`6{baG&XEjv>mO|2Xsr6SB*RyhtV@d?GZ%P`!rx=*zVYCn;J{qc4`J2sT+fi zOU|WogH3OmmbYf|Y(gVT%ruRPx~)|Ps1G=^!h(JkcDA8aEpJkfmhoySgNY3+NMTb& zZI`W?tja!B)b;+SH^B%nl$}uqxQ{$xRI;I$i4t3|_PlV(HJ zwSFKh)>WfcFR|tZyJYHWPCE_6rR~NBfK94pEmx~(&F#W27fTw}9yoVdhUdF@#@Tn= zX5jxN>+!vFhwIUjFn2&0UX8EYhCZ>nUwO#=Z)DMbMQL5vRJNKuU8s0 z42JEN@1-Wj6Cn$<;%2R0r5<&LinWTV8Ps4rsBWoSO=NYwidSe(qaizMx(N|&>l+Pe8@qS@?8*~Qa2ycbf8;uZ$&O)K52=;)L^wj= z`Wb=iMW_4M42J6n+pg$*cAB|E?+3U4?TzCN&W;H_j`#UFUh+=zJZl>q6Fppj=Q*C| zU4wS7zzNp}k=-Y8K6uj}DLjs7Q4HT8b?A}2oaZ{UM1gvqqFpc29$)wxvio^Xx_%AO zUJp#Lg8(@=j)s9KLe)2tr#^wB$G3-A$;0`ueBm;O`X8Sj<9(b*lsVZWP=D%R#zZeC z@g7d}`z9IW^l`Gpb3R+aab?WM!yu6pBzfe@AaIa;-u`F%udp)I0>H!Xxo>s19|*B!KTr0>dA(hYqx^ zEUiaiUsZ|oeqkH>*y8@xMfXoHVCQ`$ab2DIBP((ZIeusl zfWuvn+n7F4z&>V>Rgk@ySHRDCffwr)IUn)zm?B$D-4?!D{4E3L?0Z?5`M_n z;o+UUhETwF5n275IC&8vfB!|=jx2ky7a$Hu%=c_8Ee3onY-E3b2^ta)4dl7rhnQd5 zc;sVP+JP+3A>n#Iq9t#C(lP1^IrmAf##ii>Z4a)8`^&NN#}>J{nPJ3ulp}14k11ltmg}m1dWfeV0ra%vbEkiJ@i?x z`!?r(^KlIG$)dU<`^#SDJveFI1lbQM&I&RGfoDraqiVAwugi|4Nhjdva?&2rVn;UW z8ZGi8mmvUJT$U&jN20Xo8*z?Aku<{7lH?kr1=(c>bm8Ig00Vwx8F@yoLE)V~iLma1 zkyU8*Ijs}AYo4UX!!Q+fvoLUKYE6bVRU{g9HMhJT+~bJam^^yHVHESMGyU9 zuls%8{qZD5rkjASn0!Dnw*Zr2f=L-k7Rov@jVu&>aoTc@mM%H^?8DPP`7uSIM;uQ3 zBh&BRh3W-|6*(Q_@b2s9k;{P#JLZIO_;+dgxW!KEevcDl4nODgP|tW5Ds&C&_>^ zdjD^Lf`e0BPHm+3HtxLrz8fFb?oYkL%$e2&!J+IF)>o=pWj&gh2@)LsvUlh=J?@`- z+_$9RuOYT8Hm~$Be+Sr%5o~TzG`8J(k)Xp)+xo2RG{6P8SiyX!{Q`l>`tbAy5C+qtk5dr$udg1W)^~tfvSzG9B2R$(tas=qkfF)$cl-cc zryT6BPAcjO+zloWXU-7V#_1lo>>)%s&Ry#TP4Mrt&znCm`bg60wC;~^TKC5c!CLO5 z0+Yk^^ZC)s-@8rU8ha6rB=?)whZ(xHJgm(mu-H77f6z1Z8Rh=LlkT^#fq|VoPmt8r zz<=yv{sC6T6=LG+6iq{@Kxb{I;SH-{5DEju0_8LmD-;UKB`9_%15nOEaX>i(WeUng zD0n6HIjq+%5R?$YnLqw@Zl|ec2Xa~gTJpo%cT%CNOBz#zkh^s2d=h%yw?l_`ox#_@ z>iR*_{wh6mr`P?jiu-?l2N3JzeuB(yh`HYPBIl`UZc{6%j1w{T%RL9z`^zXk45|=X z^NQ6@A$N#&!jWvbz2hJYYbUm!)a~@+$4h%(dvM*Sp8JDDHkVfG--lLr%n zhkiOc68Hx(zg_jaF3n z@;S<}qlQ}2fMC>h9a_~Id|!bkPwHkFs&FmDbPd;qXMYzOO@`})yVl}+l+6ZwfXc5Q zr8V_huQke8-e}z@=FEIkg?g#nIOs65Tkts$t5(u#YF@3WF#MTO&4A(X6%)^0Sq@JG zMxSj{=BO58a*yi?dd_1??c|A^IH zW_$0wmA(J{rBiNu&%Kq|_6eXg8O zWp{KRlfI#wYR%`t6OAL$q^+YZoOEp81OBb6(-&yaHsT&Dv45@U6yDx*OlZ;QYWD&8 zi>yAp7j%j|+V>5Gl2fCisWr1gg1KJIR@6!Z$MSMR#eZ+Y3cw$nszuYlRKYXJPYRcm5*~o4+8j6+FbgH@*mg}o&LC52=Myi?(0Usj>dVy>~=Cw?a^hJ$ST}{vLtVLFe zST>BU<>vH_{G7f^_=aLTtEIBODlInq(6`*Gf9HlN$#%gO;>Qqr3P48fBa%w9dBK@_sOmY)=@T&)Y+>_%WjP0S!#gbfKYmV>mZ*)7hlPxvZ#=npKoNP`I{%{mrBiVB&FC+PD(sbf2Ksaf!BJw z>@Mp+la*q4FVE#l^37a>td~|yns48uLbjxDWyR!v%n`DgptpTZX2sZd*3$EQ69+-h zN5T--lA8zdVUDjx=i1^)C>F}6lg3(RvG7!Av5=Tic5{twf>9SePaON8)?>1ySF>ba3del z>uToYeGX^Xq1?`Mu$(o%9*#>QtjW3gjW3SfwQ@GHcQeC(aSZ>ay0!9%##%beXQgQC z?A|M{LavfCw#m5#=RVY9JNagu5n=t^RMT>uoaDkKe$L znf?4tQT#WgsVUrCj1sOmkvmNFU=~i@W{=>gD2bamX~YAAWKwA6iDr8?1kzv`J$`7oAq3wHqDIf7}q#|++_b3T} zOTVb1-+(%?FF};TiEg`nGnhKtw$FHdJ~v&VJ0!PGc^T`U@%EYIs7E5 zkfsvHuVb-N*~Rjww)X7_H|-%4+VH_x2D`<1C4lEQwp#YgHs=Qq{YN+{RXqRbdA6(M zVA53T20V!UE5!Z||6&H=!?T?5o5k{;Q1E=rxMeRvqRn2i{*LD$Wd0nPzk6&xPWs#N zBKDK{WKyVr$BW1GjgUT~I;oKbGTY1{F`S}ApjatE zKA#TXTwBB&WwE8-lTRWiTUIOIscW@~T)8|!E?jKuLd7<(TCThBx0)~ghyK%r&<*%s Gvi}c^zcEVy literal 0 HcmV?d00001 diff --git a/james_bond_data.xlsx b/james_bond_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1e042705fc38619b41b62bd98b1c1a807c061cc0 GIT binary patch literal 11269 zcmeIY1zQ|j);8QofW}>eLvRi5!QI`1ySqzp2m}opg1fuBy9Em#92$3d`<$8i&P>jH zzu=kb?%G{_?R#BaweO{SDat@WV*+3RZ~y>+6kv3kX`v4R0K`E902lx`NF5P-I~P+s z7XwvK2UBM~Mh{yXk{oDA>TCccc>e#6|HDt9EOAu6ml;*;R`N+?he3L^MhK4mBxn$w zR!N|%FJYj>SSQQU@--{s5mop*mKA3;`sAuN>-o4vjh$^nQ20QrGGbKVP?wrM0Vh-c z;4x(<4qm*IwiYNGlR$_C&%hwYEFJLGv9VW~O?2~vbQ!({7PdgZcINV+I@Ucbe%rBK3<*FpQEl6A6huDEDRWQ zvV;OjgJu>1iq=te*yXHtlfHd|Sh=#JXFVi8A1OC6-uA5%n6zZY#+k_DlG)O4=H&D?joh|4LjKwI^gv_p8BN| ze0rXI;LrL30|30fLID*2LoMson8~leT$2TB9RgS_4V+AEoS7K^IR8%_{}1cnzukIS zyqrQWGh!%6@+oZScK&A^s<4dv2gz1aRlfkKRh0UeTnd7pom6j8Rq=zNB>da`o<>%F z^2Hnuk=|^vl|`Xr@{%{Wmxrc2IJ&~pzH>?zcP!iNLvx$Ip1(9$MANnSGcrneP81x!?dv zFq~WVcqSEV$l1tzslsQ-hV=R#Uroh=+p@|i%ZZ1~)4;^0>rynm6XV&3K|W(hnT!qn zj&)LEkTUz)U%QU&d^FRuj}zEiI&?N16h&OL2KG>YSCU{9iLi9Ag1~`Q2nGCPJZzZU z?VYTR?Cq`p@LPGRathnbsO^8Z?4wkPh%3cif9|-DXJJhq`D7-2 zfotE~y%!8xpIDF~Ty! zsiS`KlMm@*7tRw5`kpM)eWzK?x;L{xl4d>!>6;MQ!)C$#2&XH5ZLH-X`|FJmz398+ z2FQMD^Jx>xq=G2Kp0zl}Ws4>e4@rPXDK}78LgH7DrRm&E7NE``K_K!Qv;ZkA&Eng= z4P!|{gG;JSt0wdOUWFR?Tg^>6Twq7ExR7>|I+^4e)pU!u9RnYiPGt1@j~TH#YQO1K zHQHY4+yVV}Jr@)M{LJ%d#BEciEVN=UYwC;ku2q7Iqn;(*0m~@wN9*&F4X7mwM=$XMc zNLr{p;b$yQplZU6E-T@e`%BBk(*!rpE3d-dtRDyN;`*IpE&8BCr@52sux%MuuRuc= zToy5z^wY@WY~$6yNTTM7gM#xo5?Zkf>22;sy{z{Do8#rB_6_SxT3GX(@!+@! zZ4o#aBFWV0R4#mLQ@PGR#Qo7AW!1yOFWHWc7C5w7m~0c31WYmc;{a^sjX_DO&8(E5t`1G z3;NM3klD`g;QTsxrT^U$AYM*Nex5=F|I6H$*IRiO9w;;8Zz`#5Lq?HouJ$btF0KVS!MVu&rY1R;b(a@ z6Q~8L-1qqIv?wSHg5p^2@`jah5cv~vVX z$`JMm%#J7%x1#%*m#;B+I1)4Yucge6-?SEo90I6RFFND5y1yi47`FTOgi2^GGNxCU zW=(_kL5CZ>$Z{@L%gw|*nLFqTTuo%ykjR`@DbYM|Zg>dfy-a7QUn6sdPyoP&b+A z8peW|f@1s@Wdm9q1FLlyP~7?F0{(eQOk1%#;y@MiryTMoRB7*<%g{f+Q`=1tgX)gb{gF+h8r3-$CPBr^jx7l zTRPN;DWE9X6(s3iRZE{QdoNX1g&UtGqT4&4D)rQ2Gd)aCkZ;rzI&c9_wBh&-nJF@4 z>N!mr_dMK=DtVkrjF704=HQTXN%L}Z0}L-Yaalr1{6%_Bmp4df!Dn{)Lp{y#r!*(}d^(ud zC$fa1=#DA&La+Rcqk41rY}<47;KDsqH+JJw3B}1>oqG|p=;_Hht?%n&Pat?}Q_u@> z4jg+<-drTvTsRp&fzl^@r95-;$U63%rVMI`r%vkb$#C>BZlw|0wM(=?3V7ik)M7Ko zA==)3NYfWLkM|59??oNA2eOm*1F)t5fVhc?IOK|szd{OPl90kVT>J!r|7XD!27 z0)C9Y*pM#dCT5N=$DT+4aCryu6&AHiJjQz5+N=9f~>r7A`t)7Lpu=$57u4o`bx1cC`CCI!YXR*TgkCz)JLz)j3BsX@1K5 zb|*C6TiejiDli7$a?i{pny!rXg62czg>g*6bW&Yzu0! zjJ(;~dj0ft%{m&)tEbxZvYMW@_=^SIUl8dRBHB9a$~N;;668xdeE7}F5XM_afnUVI zD^QfZqJkWahRbgBQGj1?Ojlu+$>8N{+5tjdkih zrgHlf_cCpv|+g2ewuU$ZSgw!Ffv7o7JpHOrOKoMDOwXJ!4{>{o#_Xf@e0W6 z(fWipDM|Mg6;F4?=e51mLxhro67I&_392zjwAjbkR4lYA`j=@|0^%A{vt)IU1X`si zBr(L##PTwA+L$EFS$ZKn`M~3o&A3qLYxZG7XX&=l4Hgw;rcbhOEoSn7-bSCD+naB`HYP+hr*8l#S+fQDiw zTiULDIdT)-0l{1!e&p7gY3C|F32?v-Do{^m9oM#RU(TAL4?O-Mwwx*5uIzkH`CyH} z%DHml)~+>V6oDW2tgX{ksexZ#t_Eu_gl%BZ^dUATztHrK(WVy zLtDr??8fWOHJ-`P(r26wC;9>E(k1g)4OA(Z0?!-`Y7^+71khj1$M{20f#_&;5zHzuz;vbY3XQO?E@0*sZ$4~X?CNr{#vI8fKnaf|{bEbD{7VZZlxM5S=3R2!`db|>L2pjk;c?p=dAIK?MmVcJB zylF8SqgLH^@AvTHvF)DTk%`E}p!C2yHPG~EJm*>=M?H#yoosD+tT=^vy>DryPnp!V z-Ft(bw&Q7F)3W1YnZTD9O;`jZ=gdDypnQw1gn}&xWna*=G?hqoR!@D3%!s{s{2{0` zwn!QoJwqf`kDc6r9$}_hlskoaCDru3PdrB5IG5D-agKzKdXtC^M3QB0o+c?jMrt>s zqf;@B-}}qsm0LoZn>M)pI{B(rt|~LWTcXaMld8N|#G!)c(^P~{KT!tMg9}=FQ??p_ zrZ@CFy%jG{-nm35%`)BjVsk*Pf=f6LlkxSYfKo=DS#P)~{4_sJJMZ>&xZ=<~k4=rgv3iBjJ?~EZ=uZ)_ z(>TGd!}$s>sr)^A4oL5TBZ2||u3!NGq<^?2XBST!Q|CX+hH5KzE6j+##8v*NSEnOx z+w?#i961%JH!UB9cnvEf8K{I^SxoYsd5+g5?&F&*8WERd$du-$Nl~N*HDWOLQ&ZEd zlQuSb2E&VWadd6bpH5gV&2Z9bHG^);1~d)^Q5);F5cF8yP~~&PYcHlD71Q-vD7DHc$4%W9iYiw)gjL1ecLqpWWw|A#SZgVb{?K)3 zwXbH5zu&UUt@~~bp_*^ujpf{`F)Hbt-l-1G;O%#Vqq~pA@5wv?UPx2gVfN$e9LKX0|M*ikzlOsos@}|N*L-eQJQu3gqKf|54 z)IxI}3!8MKn2zlog03rJaxZr<1hnzO#j2K2V#7^Z%_N|YPx`pDjtKpe?mER?6hV=Z zsB`;vkJGihG(@e%XR`ZKxuZ~Hy7f)0sS-oVpTI{CF4;53=Ts~osy*1s$|tQAY_jgb z>-M84!^1VhnD)x=UpI+Q5EafQSAG@VA95hHr*>v|CdgoWURpF?or>;tuwR=8$K6`2ro~-+Z=|kKHpwp9XVdsCX zz+-mRYITDH%2a#+0P~-UWbx6-)I`O_$cqGb#^-KyU98!IH`qLZ4I0$;7MY!rq z+*Qs%JX7KllBNF@8Ff9pVdr=6%ptU_Se+Asd2CW;>~&`M^z1&UO#tm4B7kQTz0^U7 zeC^aLT@t&K4D`65R+B<{Dq0X}SBtK=3T08j?xjW?uFBIzoKjZr$w1736DUA`lIfqg zrQ)ItKCj;0Hh$k>`X)Dx5M}(%LzDO_xppK)QBR_Dv!IAOe6iDBI=jwh%*Jsaop32`@;_UQ!h8m(W08^ z%N>eYbsdi6{C|mk$I)~2FM~!vosgP)(ZM-{(;K}xh*k=m$VZhuDKk&HaWA8Zc%y)|&TSsu-Xm!g(K+2rLzp zI%+Q%v8{kR*rW}>*GUY7=Grnx&W=imn%9A}2IFTH^?h+nUbamI#PM(UGd;Jur+MG` z-+$=~OT^iRx_~|2IJ~OE*dQ)7ial$~3?T^g`v@QK6k%EW_#qchhc^2P?@U8NyaLw( zR|MmM69WbNy?m;+I`&5%ASq7(YE3~i|Ca>rFA1W^!iuAEKgnnsu7YOKMOAZE`ls?$ z5#I*qqY^wEC!#dXCYx%60+e48Bdq(+sg_$vdHbOzI=>SY=M%}v1gZ#Q1k8lk%GpsW z(xypEeiJL3z05+wi4E%e<^?Ap8kzx!JS#;?(T-(cophEwxuPBbgUUyEJrqsoL|PLq1NS_{pUr*53w;KylQsrQ;7tSzE_gVM3tbjlG6NQ@7Puw&_n3Emfq24=~jIfc}V5h%0=_tesXj}kWPg6V$`<$ z?&CUKqRz(JD2syu!F;EqLnC5lcckc)TgagK>|>VLASp1O8s!37UACPQlEq~3728qc z*_3R^wr*6Jmh#)QtF5i|?Se*6w;13()rUm%Z$n#N8RzY9c#&=szje}jHiuj+kFx17 zUVZ3g9m>8HRzJF#7+H!#13G16ZT0K3;7-pyV#CZHEnkQ(l?o90w6#BNA!fppRJ2Uh zf!@JFl{s1Pz1<^nmG8>CTkkoA=`=yHn$*3!zb;^N7lt{ zj&p8bm)5A%H1VEv{MP@#C6AMWw^-To3PBbwpPrduaa}ZLcQXBFyU_pksKCB3I$qB@m>Dw^1p9~*?;WG0 zzL1T5ql8<=tT{MvD(qxs`Yx0r{d%3r>WF@&pK<(g_nbS-q%^oJn2EW;O7*iOW-b8J zUNnE?ZM)5nO?LP)W9h0ErMO&VdTp!}9K0WioG@9kx!q0hHXBkS$`#Ia?(!ml(Ppl zSgTSv#)p3TM6os?lg4P_b-nibaO}s#MKqM#^%^0QMsMDSk&W9nNi22_j?4cZxC3Ds zNTR^x!v<3m_20?&(ZS(=+yj&DpGQXgI5>wv{RDdRfD8#@`Beu8A8e#Z!Qf<}1f?E- znXIiPO0B$M2i>Y*pe$_Dd{W%_jE5+QTx^HZPA4915?tBmr4|X<4ve{el;&G7s1%3@ z!u5e-^eF2?lUhBz1EV&&hKEgtsmM@4+9GJ zJejU-V;m)jfE8aibY{;$>_2Y8yZ4t%H)JK|pC2TJuvUYG(O4mJE#7HX`WDu$1(jh} z*{Op86_Omq1RE?r{7aegkTC6s#r;?Q#B) zFwMBUgk%ZvQp_lYPQ`i)H@W%>L8`j& zwcUvE*eT)lreRmPKZSSvQ7cpWFnY&$w~txZeRkvRTby_fP(bzzom1P2Vcv#5qo+t!P-re1yhvcs2Q3x5U75ReSu z;P}5cT>fjp{x$xW_De;X{}k|_tzrKH{xPP2wc_6z#eN6=-Y)YOv<+OM{jGWCckqAq zDEtKl06qc#1pohZEc`C#_l@hnBsC)Ze}Cd{+t|NL`Ms|EmlS<)8UntR->b~O3;4YP z@RxvVa4G{9@K-(Hcj)gq&tFghqJKbt&wzfH@SiU9FFXLiLID8$&7uAd|4$?M@9>Iu c{|5i3NmP`1172MK03Q4b0Q+ovnm?}o9|Ua)9RL6T literal 0 HcmV?d00001 diff --git a/james_bond_data_cleansed.csv b/james_bond_data_cleansed.csv new file mode 100644 index 0000000000..04d4f213b9 --- /dev/null +++ b/james_bond_data_cleansed.csv @@ -0,0 +1,26 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget,Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond,Release_Year +1962-06-01,Dr. No,Sean Connery,Sunbeam,16067035.0,59567035.0,1000000.0,110,7.3,7.7,2,4,1962 +1963-08-01,From Russia with Love,Sean Connery,Bently,24800000.0,78900000.0,2000000.0,115,7.5,8.0,0,11,1963 +1964-05-01,Goldfinger,Sean Connery,Aston Martin,51100000.0,124900000.0,3000000.0,110,7.8,8.4,1,9,1964 +1965-09-01,Thunderball,Sean Connery,Aston Martin,63600000.0,141200000.0,9000000.0,130,7.0,6.8,0,20,1965 +1967-11-01,You Only Live Twice,Sean Connery,Toyota,43100000.0,111600000.0,9500000.0,117,6.9,6.3,1,21,1967 +1969-07-01,On Her Majesty's Secret Service,George Lazenby,Mercury,22800000.0,82000000.0,8000000.0,142,6.8,6.7,1,5,1969 +1971-03-01,Diamonds Are Forever,Sean Connery,Ford,43800000.0,116000000.0,7200000.0,120,6.7,6.3,0,7,1971 +1973-08-01,Live and Let Die,Roger Moore,AMC,35400000.0,161800000.0,7000000.0,121,6.8,5.9,0,8,1973 +1974-07-01,The Man with the Golden Gun,Roger Moore,AMC,21000000.0,97600000.0,7000000.0,125,6.7,5.1,0,1,1974 +1977-04-01,The Spy Who Loved Me,Roger Moore,Lotus,46800000.0,185400000.0,14000000.0,125,7.1,6.8,1,31,1977 +1979-10-01,Moonraker,Roger Moore,Lotus,70300000.0,210300000.0,31000000.0,126,6.2,5.7,1,12,1979 +1981-06-01,For Your Eyes Only,Roger Moore,Citroen,54800000.0,195300000.0,28000000.0,127,6.8,6.3,0,18,1981 +1983-03-01,Octopussy,Roger Moore,Bajaj,67900000.0,187500000.0,27500000.0,131,6.5,5.3,0,15,1983 +1985-10-01,A View to a Kill,Roger Moore,Rolls Royce,50327960.0,152627960.0,30000000.0,131,6.2,4.7,0,5,1985 +1987-05-01,The Living Daylights,Timothy Dalton,Rolls Royce,51185000.0,191200000.0,40000000.0,130,6.7,6.3,2,13,1987 +1989-01-01,License to Kill,Timothy Dalton,Aston Martin,34667015.0,156167015.0,42000000.0,133,6.5,6.0,1,10,1989 +1995-09-01,GoldenEye,Pierce Brosnan,BMW,106429941.0,356429941.0,60000000.0,130,7.2,6.9,1,47,1995 +1997-07-01,Tomorrow Never Dies,Pierce Brosnan,Aston Martin,125304276.0,339504276.0,110000000.0,119,6.4,6.0,1,30,1997 +1999-06-01,The World Is Not Enough,Pierce Brosnan,BMW,126930660.0,361730660.0,135000000.0,128,6.3,5.7,1,27,1999 +2002-08-01,Die Another Day,Pierce Brosnan,Aston Martin,160942139.0,431942139.0,142000000.0,133,6.0,6.1,2,31,2002 +2006-02-01,Casino Royale,Daniel Craig,Aston Martin,167365000.0,596365000.0,102000000.0,144,7.9,7.8,3,11,2006 +2008-12-01,Quantum of Solace,Daniel Craig,Aston Martin,169368427.0,591692078.0,230000000.0,106,6.7,6.1,6,16,2008 +2012-11-01,Skyfall,Daniel Craig,Aston Martin,304360277.0,1108561108.0,200000000.0,143,7.8,8.2,1,26,2012 +2015-09-01,Spectre,Daniel Craig,Aston Martin,200074175.0,879620923.0,245000000.0,148,6.8,6.4,1,30,2015 +2021-11-01,No Time to Die,Daniel Craig,Aston Martin,160891007.0,759959662.0,275000000.0,163,7.3,7.3,1,14,2021 From 04f5a12a883bd694e2b413113afb316c71925124 Mon Sep 17 00:00:00 2001 From: gahjelle Date: Wed, 15 Nov 2023 12:34:23 +0100 Subject: [PATCH 2/5] Move data analysis files into separate folder --- .../Solution.ipynb | 0 .../james_bond_data.csv | 0 .../james_bond_data.json | 0 .../james_bond_data.parquet | Bin .../james_bond_data.xlsx | Bin .../james_bond_data_cleansed.csv | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename Solution.ipynb => python-data-analysis/Solution.ipynb (100%) rename james_bond_data.csv => python-data-analysis/james_bond_data.csv (100%) rename james_bond_data.json => python-data-analysis/james_bond_data.json (100%) rename james_bond_data.parquet => python-data-analysis/james_bond_data.parquet (100%) rename james_bond_data.xlsx => python-data-analysis/james_bond_data.xlsx (100%) rename james_bond_data_cleansed.csv => python-data-analysis/james_bond_data_cleansed.csv (100%) diff --git a/Solution.ipynb b/python-data-analysis/Solution.ipynb similarity index 100% rename from Solution.ipynb rename to python-data-analysis/Solution.ipynb diff --git a/james_bond_data.csv b/python-data-analysis/james_bond_data.csv similarity index 100% rename from james_bond_data.csv rename to python-data-analysis/james_bond_data.csv diff --git a/james_bond_data.json b/python-data-analysis/james_bond_data.json similarity index 100% rename from james_bond_data.json rename to python-data-analysis/james_bond_data.json diff --git a/james_bond_data.parquet b/python-data-analysis/james_bond_data.parquet similarity index 100% rename from james_bond_data.parquet rename to python-data-analysis/james_bond_data.parquet diff --git a/james_bond_data.xlsx b/python-data-analysis/james_bond_data.xlsx similarity index 100% rename from james_bond_data.xlsx rename to python-data-analysis/james_bond_data.xlsx diff --git a/james_bond_data_cleansed.csv b/python-data-analysis/james_bond_data_cleansed.csv similarity index 100% rename from james_bond_data_cleansed.csv rename to python-data-analysis/james_bond_data_cleansed.csv From 085ad86a93b87cf21c2d3329323ea1f0a19de77f Mon Sep 17 00:00:00 2001 From: gahjelle Date: Wed, 15 Nov 2023 12:34:38 +0100 Subject: [PATCH 3/5] Format notebook with Black --- python-data-analysis/Solution.ipynb | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/python-data-analysis/Solution.ipynb b/python-data-analysis/Solution.ipynb index 356b6682e7..afaa57346b 100644 --- a/python-data-analysis/Solution.ipynb +++ b/python-data-analysis/Solution.ipynb @@ -200,7 +200,9 @@ "metadata": {}, "outputs": [], "source": [ - "james_bond_df[[\"US_Gross\", \"World_Gross\", \"Budget ($ 000s)\", \"Film_Length\"]].head()" + "james_bond_df[\n", + " [\"US_Gross\", \"World_Gross\", \"Budget ($ 000s)\", \"Film_Length\"]\n", + "].head()" ] }, { @@ -242,7 +244,9 @@ "metadata": {}, "outputs": [], "source": [ - "james_bond_df[\"Release\"] = pd.to_datetime(james_bond_df[\"Release\"], format=\"%B, %Y\")\n", + "james_bond_df[\"Release\"] = pd.to_datetime(\n", + " james_bond_df[\"Release\"], format=\"%B, %Y\"\n", + ")\n", "james_bond_df[\"Release_Year\"] = james_bond_df[\"Release\"].dt.year" ] }, @@ -290,7 +294,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", "james_bond_df[james_bond_df[\"Movie\"].isin(duplicate_movies)]" ] @@ -361,7 +364,9 @@ "metadata": {}, "outputs": [], "source": [ - "james_bond_df[\"Bond_Car_MFG\"] = james_bond_df[\"Bond_Car_MFG\"].str.replace(\"Astin\", \"Aston\")" + "james_bond_df[\"Bond_Car_MFG\"] = james_bond_df[\"Bond_Car_MFG\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + ")" ] }, { @@ -446,8 +451,8 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", - "#x = james_bond_df[\"Avg_User_IMDB\"].values.reshape(-1, 1)\n", - "#y = james_bond_df[\"Avg_User_Rtn_Tom\"].values.reshape(-1, 1)\n", + "# x = james_bond_df[\"Avg_User_IMDB\"].values.reshape(-1, 1)\n", + "# y = james_bond_df[\"Avg_User_Rtn_Tom\"].values.reshape(-1, 1)\n", "\n", "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", "y = james_bond_df[\"Avg_User_Rtn_Tom\"].array.reshape(-1, 1)\n", @@ -475,7 +480,9 @@ "model.fit(x, y)\n", "\n", "r_squared = f\"R-Squared: {round(model.score(x, y),2)}\"\n", - "best_fit_equation = f\"y={round(model.coef_[0][0], 4)}x{round(model.intercept_[0], 4)}\"\n", + "best_fit_equation = (\n", + " f\"y={round(model.coef_[0][0], 4)}x{round(model.intercept_[0], 4)}\"\n", + ")\n", "y_pred = model.predict(x)\n", "\n", "plt.title(\"Scatter Plot of Ratings.\")\n", @@ -502,7 +509,9 @@ "metadata": {}, "outputs": [], "source": [ - "film_length_groups = james_bond_df[\"Film_Length\"].value_counts(bins=7, sort=False)\n", + "film_length_groups = james_bond_df[\"Film_Length\"].value_counts(\n", + " bins=7, sort=False\n", + ")\n", "film_length_groups.plot(kind=\"bar\", title=\"Film Length Distribution\").set(\n", " xlabel=\"Time Range (mins)\", ylabel=\"Count\"\n", ")" From 022fbe3b9fc623c24a2563c904a7ca955fc2a7dd Mon Sep 17 00:00:00 2001 From: gahjelle Date: Wed, 15 Nov 2023 12:37:02 +0100 Subject: [PATCH 4/5] Add README file --- python-data-analysis/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 python-data-analysis/README.md diff --git a/python-data-analysis/README.md b/python-data-analysis/README.md new file mode 100644 index 0000000000..4284962772 --- /dev/null +++ b/python-data-analysis/README.md @@ -0,0 +1 @@ +# Using Python for Data Analysis From 87e24405841d35a380dfb91bd51039a52558beb8 Mon Sep 17 00:00:00 2001 From: eyrei123 <88923476+eyrei123@users.noreply.github.com> Date: Sun, 19 Nov 2023 15:38:54 +0000 Subject: [PATCH 5/5] Delete python-data-analysis/Solution.ipynb Removal of solution notebook. Now obsolete. --- python-data-analysis/Solution.ipynb | 576 ---------------------------- 1 file changed, 576 deletions(-) delete mode 100644 python-data-analysis/Solution.ipynb diff --git a/python-data-analysis/Solution.ipynb b/python-data-analysis/Solution.ipynb deleted file mode 100644 index afaa57346b..0000000000 --- a/python-data-analysis/Solution.ipynb +++ /dev/null @@ -1,576 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", - "metadata": {}, - "source": [ - "## Reading Data From CSV Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a360772e-7829-4c15-9af9-d4596efc7351", - "metadata": {}, - "outputs": [], - "source": [ - "! python -m pip install pandas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1148ca58-9a4d-42ed-a43a-e3b8b3359c2d", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "james_bond_df = pd.read_csv(\"james_bond_data.csv\")\n", - "james_bond_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "e47c1f9b-b390-4035-956b-622615b57f32", - "metadata": {}, - "source": [ - "## Reading Data From Other Sources" - ] - }, - { - "cell_type": "markdown", - "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", - "metadata": {}, - "source": [ - "### Reading Excel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0364b81-64a0-4098-89fc-e58bd6d68257", - "metadata": {}, - "outputs": [], - "source": [ - "! python -m pip install openpyxl" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", - "metadata": {}, - "outputs": [], - "source": [ - "import openpyxl\n", - "import pandas as pd\n", - "\n", - "james_bond_df_excel = pd.read_excel(\"james_bond_data.xlsx\")\n", - "james_bond_df_excel.head()" - ] - }, - { - "cell_type": "markdown", - "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", - "metadata": {}, - "source": [ - "### Reading JSON" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7465cd11-dad4-4741-9372-f825b28c33d6", - "metadata": { - "jupyter": { - "source_hidden": true - } - }, - "outputs": [], - "source": [ - "james_bond_df_json = pd.read_json(\"james_bond_data.json\")\n", - "james_bond_df_json.head()" - ] - }, - { - "cell_type": "markdown", - "id": "69f884c2-92e8-4db3-bd63-84007f654808", - "metadata": {}, - "source": [ - "### Scraping HTML" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b902722d-9648-4124-80b0-64004342170d", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m pip install lxml" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df_html = pd.read_html(\n", - " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", - ")\n", - "james_bond_df_html[1].head()" - ] - }, - { - "cell_type": "markdown", - "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", - "metadata": {}, - "source": [ - "### Reading Parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m pip install pyarrow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd6f496d-aa94-43ce-9e97-6f01108df47b", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df_parquet = pd.read_parquet(\"james_bond_data.parquet\")\n", - "james_bond_df_parquet.head()" - ] - }, - { - "cell_type": "markdown", - "id": "e432b28e-257b-422b-b2f8-06f41608391b", - "metadata": {}, - "source": [ - "## Dealing With Missing Data and Invalid Data Types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38eb1abb-9f89-4a53-9e77-f7c71dbeff18", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[james_bond_df.isna().any(axis=\"columns\")]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1db9201a-11c1-4cdd-9625-d70cee736191", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df.at[10, \"Avg_User_IMDB\"] = 7.1\n", - "james_bond_df.at[10, \"Avg_User_Rtn_Tom\"] = 6.8" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "001996e3-2fce-4228-a873-b78eef613bba", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\n", - " [\"US_Gross\", \"World_Gross\", \"Budget ($ 000s)\", \"Film_Length\"]\n", - "].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"US_Gross\"] = (\n", - " james_bond_df[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True)\n", - ").astype(float)\n", - "\n", - "james_bond_df[\"World_Gross\"] = (\n", - " james_bond_df[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True)\n", - ").astype(float)\n", - "\n", - "james_bond_df[\"Budget ($ 000s)\"] = (\n", - " james_bond_df[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True)\n", - ").astype(float)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Film_Length\"] = (\n", - " james_bond_df[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Release\"] = pd.to_datetime(\n", - " james_bond_df[\"Release\"], format=\"%B, %Y\"\n", - ")\n", - "james_bond_df[\"Release_Year\"] = james_bond_df[\"Release\"].dt.year" - ] - }, - { - "cell_type": "markdown", - "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", - "metadata": {}, - "source": [ - "## Dealing With Inconsistencies in Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc483320-7895-4368-a672-b98f8d0c9755", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Budget ($ 000s)\"] = james_bond_df[\"Budget ($ 000s)\"] * 1000\n", - "james_bond_df.rename(columns={\"Budget ($ 000s)\": \"Budget\"}, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", - "metadata": {}, - "source": [ - "## Removing Duplicate Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Movie\"].value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20067efb-e7c7-4690-b483-1d29847ad24f", - "metadata": {}, - "outputs": [], - "source": [ - "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", - "james_bond_df[james_bond_df[\"Movie\"].isin(duplicate_movies)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df.drop_duplicates(inplace=True, ignore_index=True)" - ] - }, - { - "cell_type": "markdown", - "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", - "metadata": {}, - "source": [ - "## Removing Typos" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Bond\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Bond\"] = james_bond_df[\"Bond\"].str.replace(\"Shawn\", \"Sean\")\n", - "james_bond_df[\"Bond\"] = james_bond_df[\"Bond\"].str.replace(\"MOORE\", \"Moore\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25da4a99-6b90-4785-aaa4-48bed819e9be", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Bond\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a26b138d-72e5-4e15-a875-ee65023545d1", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Bond_Car_MFG\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Bond_Car_MFG\"] = james_bond_df[\"Bond_Car_MFG\"].str.replace(\n", - " \"Astin\", \"Aston\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", - "metadata": {}, - "source": [ - "## Checking for Invalid Outliers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81338285-2067-46fc-82f8-360d92ec7153", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[[\"Film_Length\", \"Martinis\"]].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d84acfc8-fb2d-4c45-bfac-20469fc1de97", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Film_Length\"].replace(1200, 120, inplace=True)\n", - "james_bond_df[\"Martinis\"].replace(-6, 6, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "id": "52db1351-36ed-4104-a999-345ebbc62214", - "metadata": {}, - "source": [ - "## Storing Your Cleansed Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "575a774e-6913-41fb-8ff9-4d786f478007", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df.to_csv(\"james_bond_data_cleansed.csv\", index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", - "metadata": {}, - "source": [ - "## Using Python for Data Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m pip install matplotlib scikit-learn numpy" - ] - }, - { - "cell_type": "markdown", - "id": "0246dcb1-88fc-4a3e-acc1-571037390e09", - "metadata": {}, - "source": [ - "## Performing a Regression Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "# x = james_bond_df[\"Avg_User_IMDB\"].values.reshape(-1, 1)\n", - "# y = james_bond_df[\"Avg_User_Rtn_Tom\"].values.reshape(-1, 1)\n", - "\n", - "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", - "y = james_bond_df[\"Avg_User_Rtn_Tom\"].array.reshape(-1, 1)\n", - "\n", - "plt.title(\"Scatter Plot of Ratings.\")\n", - "plt.xlabel(\"Average IMDb Rating\")\n", - "plt.ylabel(\"Average Rotten Tomatoes Rating\")\n", - "plt.scatter(x, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "import matplotlib.pyplot as plt\n", - "\n", - "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", - "y = james_bond_df[\"Avg_User_Rtn_Tom\"].array.reshape(-1, 1)\n", - "\n", - "model = LinearRegression()\n", - "model.fit(x, y)\n", - "\n", - "r_squared = f\"R-Squared: {round(model.score(x, y),2)}\"\n", - "best_fit_equation = (\n", - " f\"y={round(model.coef_[0][0], 4)}x{round(model.intercept_[0], 4)}\"\n", - ")\n", - "y_pred = model.predict(x)\n", - "\n", - "plt.title(\"Scatter Plot of Ratings.\")\n", - "plt.xlabel(\"Average IMDb Rating\")\n", - "plt.ylabel(\"Average Rotten Tomatoes Rating\")\n", - "plt.scatter(x, y)\n", - "plt.text(7.25, 5.5, r_squared, fontsize=10)\n", - "plt.text(7.25, 7, best_fit_equation, fontsize=10)\n", - "plt.plot(x, y_pred, color=\"red\")" - ] - }, - { - "cell_type": "markdown", - "id": "b38df412-c320-49fb-93ae-e253405537a8", - "metadata": {}, - "source": [ - "## Investigating a Statistical Distribution" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", - "metadata": {}, - "outputs": [], - "source": [ - "film_length_groups = james_bond_df[\"Film_Length\"].value_counts(\n", - " bins=7, sort=False\n", - ")\n", - "film_length_groups.plot(kind=\"bar\", title=\"Film Length Distribution\").set(\n", - " xlabel=\"Time Range (mins)\", ylabel=\"Count\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", - "metadata": {}, - "outputs": [], - "source": [ - "james_bond_df[\"Film_Length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" - ] - }, - { - "cell_type": "markdown", - "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", - "metadata": {}, - "source": [ - "## Finding No Relationship" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2bb83374-347f-4cf6-bc21-8180a003371d", - "metadata": {}, - "outputs": [], - "source": [ - "x = james_bond_df[\"Avg_User_IMDB\"].array.reshape(-1, 1)\n", - "y = james_bond_df[\"Kills_Bond\"].array.reshape(-1, 1)\n", - "\n", - "plt.title(\"Scatter Plot of Kills vs Ratings.\")\n", - "plt.xlabel(\"Average IMDb Rating\")\n", - "plt.ylabel(\"Kills By Bond\")\n", - "plt.scatter(x, y)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}