diff --git a/notebooks/shapley_explainability/XGBoostGenericShapleyFraudDetection.ipynb b/notebooks/shapley_explainability/XGBoostGenericShapleyFraudDetection.ipynb
new file mode 100644
index 0000000..18f7732
--- /dev/null
+++ b/notebooks/shapley_explainability/XGBoostGenericShapleyFraudDetection.ipynb
@@ -0,0 +1,683 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Explainable fraud detection model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this example we develop a small XGBoost-based fraud detection model for credit card transactions, export it to TorchScript using Hummingbird (https://github.com/microsoft/hummingbird), and run Shapley Value Sampling explanations on it (see https://captum.ai/api/shapley_value_sampling.html for reference), implemented as a TorchScript script.\n",
+    "\n",
+    "We load both the model and the explainability script into RedisAI and trigger them in a DAG."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For this example we use a dataset of credit card transactions made in September 2013 by European cardholders.\n",
+    "The dataset covers transactions that occurred over two days, with 492 frauds out of 284,807 transactions.\n",
+    "\n",
+    "The dataset is available at https://www.kaggle.com/mlg-ulb/creditcardfraud. For anonymity purposes, the features are 28 PCA-transformed features (V1 to V28), along with the transaction Time and Amount.\n",
+    "\n",
+    "__In order to run this notebook, please download the `creditcard.csv` file from Kaggle and place it in the `data/` directory.__\n",
+    "\n",
+    "Once the file is in place, we start by importing Pandas and reading the data. We create a dataframe of covariates and a series of targets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "df = pd.read_csv('data/creditcard.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = df.drop(['Class'], axis=1)\n",
+    "Y = df['Class']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We start off by randomly splitting the data into train and test sets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "seed = 7\n",
+    "test_size = 0.33\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)"
+   ]
+  },
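+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before fitting a model, it is worth a quick look at the class balance of the training split (a small sanity check added here): fraud cases are a tiny minority."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_train.value_counts()"
+   ]
+  },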
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next we use XGBoost to classify the transactions. Note that we convert the arguments to `fit` to NumPy arrays.\n",
+    "\n",
+    "(As the warnings below point out, the intended option here is `use_label_encoder=False`; the `label_encoder` argument is not used by XGBoost.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/dvirdukhan/Code/redisai-examples/venv/lib/python3.8/site-packages/xgboost/sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n",
+      "  warnings.warn(label_encoder_deprecation_msg, UserWarning)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[05:49:05] WARNING: ../src/learner.cc:573: \n",
+      "Parameters: { \"label_encoder\" } might not be used.\n",
+      "\n",
+      "  This may not be accurate due to some parameters are only used in language bindings but\n",
+      "  passed down to XGBoost core. Or some parameters are not used but slip through this\n",
+      "  verification. Please open an issue if you find above cases.\n",
+      "\n",
+      "\n",
+      "[05:49:05] WARNING: ../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
+       "              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
+       "              importance_type='gain', interaction_constraints='',\n",
+       "              label_encoder=False, learning_rate=0.300000012, max_delta_step=0,\n",
+       "              max_depth=6, min_child_weight=1, missing=nan,\n",
+       "              monotone_constraints='()', n_estimators=100, n_jobs=16,\n",
+       "              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,\n",
+       "              scale_pos_weight=1, subsample=1, tree_method='exact',\n",
+       "              validate_parameters=1, verbosity=None)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from xgboost import XGBClassifier\n",
+    "\n",
+    "model = XGBClassifier(label_encoder=False)\n",
+    "model.fit(X_train.to_numpy(), y_train.to_numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now obtain predictions on the test dataset and round them to integer class labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = model.predict(X_test.to_numpy())\n",
+    "predictions = [round(value) for value in y_pred]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We evaluate the accuracy of our model on the test set (just as an example: the dataset is heavily imbalanced, so accuracy is not a fair characterization of performance in this case)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 99.96%\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
+    "\n",
+    "accuracy = accuracy_score(y_test, predictions)\n",
+    "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Looking at the confusion matrix gives a clearer picture."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[93813,     8],\n",
+       "       [   28,   138]])"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "confusion_matrix(y_test, predictions)"
+   ]
+  },
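+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because accuracy is dominated by the majority class, precision and recall on the fraud class give a fairer picture (a short check added here, using standard scikit-learn metrics):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import precision_score, recall_score\n",
+    "\n",
+    "print(\"Precision: %.2f\" % precision_score(y_test, predictions))\n",
+    "print(\"Recall: %.2f\" % recall_score(y_test, predictions))"
+   ]
+  },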
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We are interested in exploring the cases of fraud, so we extract them from the test set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_test_fraud = X_test[y_test == 1].to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We check how many of these fraud cases the model gets right."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ True,  True,  True,  True,  True,  True,  True, False,  True,\n",
+       "       False,  True,  True,  True, False,  True,  True, False,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
+       "       False,  True,  True,  True,  True, False,  True, False,  True,\n",
+       "       False,  True,  True,  True,  True, False, False,  True,  True,\n",
+       "        True,  True,  True, False,  True, False,  True, False, False,\n",
+       "        True,  True,  True,  True,  True,  True,  True, False,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True, False, False,\n",
+       "        True,  True, False,  True,  True,  True,  True,  True,  True,\n",
+       "        True,  True, False,  True,  True,  True,  True,  True,  True,\n",
+       "        True,  True,  True, False, False,  True,  True,  True,  True,\n",
+       "        True, False, False,  True,  True,  True, False,  True,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True, False,  True,\n",
+       "        True,  True, False,  True,  True,  True,  True, False,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
+       "        True,  True,  True,  True,  True,  True,  True, False,  True,\n",
+       "        True,  True,  True,  True])"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.predict(X_test_fraud) == 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exporting to TorchScript with Hummingbird"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "From the project page (https://github.com/microsoft/hummingbird):\n",
+    "\n",
+    "> Hummingbird is a library for compiling trained traditional ML models into tensor computations. Hummingbird allows users to seamlessly leverage neural network frameworks (such as PyTorch) to accelerate traditional ML models.\n",
+    "\n",
+    "Hummingbird can take scikit-learn, XGBoost or LightGBM models and export them to PyTorch, TorchScript, ONNX and TVM. This works very well for running ML models on RedisAI while taking advantage of vectorized CPU instructions or GPUs.\n",
+    "\n",
+    "We choose to convert the boosted tree to tensor computations using the `gemm` implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from hummingbird.ml import convert, load"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extra_config={\n",
+    "    \"tree_implementation\": \"gemm\"\n",
+    "}\n",
+    "\n",
+    "hummingbird_model = convert(model, 'torchscript', test_input=X_test_fraud, extra_config=extra_config)"
+   ]
+  },
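+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can sanity-check the conversion before exporting (a quick check added here; Hummingbird containers expose a scikit-learn-like `predict`):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(hummingbird_model.predict(X_test_fraud) == model.predict(X_test_fraud)).all()"
+   ]
+  },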
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "At this point, `hummingbird_model` is an object containing a TorchScript model that is ready to be exported."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "torch.jit.save(hummingbird_model.model, \"models/fraud_detection_model.pt\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can verify everything works by loading the model and running a prediction. The model outputs a tuple containing the predicted classes and the output probabilities."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_model = torch.jit.load(\"models/fraud_detection_model.pt\")\n",
+    "\n",
+    "X_test_fraud_tensor = torch.from_numpy(X_test_fraud)\n",
+    "\n",
+    "loaded_output_classes, loaded_output_probs = loaded_model(X_test_fraud_tensor)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can now compare against the original output from the XGBoost model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "xgboost_output_classes = torch.from_numpy(model.predict(X_test_fraud))\n",
+    "\n",
+    "torch.equal(loaded_output_classes, xgboost_output_classes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explainer Script"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The script `torch_shapley.py` is a TorchScript script designed specifically to run on RedisAI. It uses the RedisAI extension to TorchScript, which allows any model stored in RedisAI to be executed from within a script. Let's go over the details.\n",
+    "\n",
+    "In RedisAI, each entry point (function in the script) must have the signature\n",
+    "`function_name(tensors: List[Tensor], keys: List[str], args: List[str])`.\n",
+    "In our case the entry point is `shapley_sample(tensors: List[Tensor], keys: List[str], args: List[str])` and the parameters are:\n",
+    "```\n",
+    "Tensors:\n",
+    "    tensors[0] - x : Input tensor to the model\n",
+    "    tensors[1] - baselines : Optional - reference values which replace each feature when\n",
+    "                             ablated; if no baselines are provided, baselines are set\n",
+    "                             to all zeros\n",
+    "\n",
+    "Keys:\n",
+    "    keys[0] - model_key: Redis key name where the model is stored as a RedisAI model.\n",
+    "\n",
+    "Args:\n",
+    "    args[0] - n_samples: number of random feature permutations performed\n",
+    "    args[1] - number_of_outputs: number of model outputs\n",
+    "    args[2] - output_tensor_index: index of the tested output tensor\n",
+    "    args[3] - Optional - target: output indices for which Shapley Value Sampling is\n",
+    "                                 computed; if the model returns a single scalar,\n",
+    "                                 target can be None\n",
+    "```\n",
+    "\n",
+    "The script creates `n_samples` random permutations of the input features. For each permutation, starting from the baseline, it adds the features back one at a time and measures each feature's contribution to the result by repeatedly running the model. A local sanity check against Captum's reference implementation follows below."
+   ]
+  },
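+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a local sanity check (a sketch added here, assuming `captum` is installed; `fraud_prob` is a helper we define), we can run Captum's reference `ShapleyValueSampling` on the loaded TorchScript model for the first fraud sample and confirm it produces one attribution per feature:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from captum.attr import ShapleyValueSampling\n",
+    "\n",
+    "# Forward function returning the probability of the fraud class\n",
+    "# (second element of the model's output tuple, column 1).\n",
+    "def fraud_prob(t):\n",
+    "    return loaded_model(t)[1][:, 1]\n",
+    "\n",
+    "svs = ShapleyValueSampling(fraud_prob)\n",
+    "captum_attrib = svs.attribute(X_test_fraud_tensor[:1], n_samples=20)\n",
+    "captum_attrib.shape"
+   ]
+  },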
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Serving model and explainer in RedisAI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "At this point we can load the exported model into RedisAI and serve it from there. We will also load the `torch_shapley.py` script, which calculates the Shapley values of a model from within RedisAI. After making sure RedisAI is running, we initialize the client."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import redisai\n",
+    "\n",
+    "rai = redisai.Client()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We read the model and the script."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"models/fraud_detection_model.pt\", \"rb\") as f:\n",
+    "    fraud_detection_model_blob = f.read()\n",
+    "\n",
+    "with open(\"torch_shapley.py\", \"rb\") as f:\n",
+    "    shapley_script = f.read()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We load both the model and the script into RedisAI."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OK'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "rai.modelstore(\"fraud_detection_model\", \"TORCH\", \"CPU\", fraud_detection_model_blob)\n",
+    "rai.scriptstore(\"shapley_script\", device='CPU', script=shapley_script, entry_points=[\"shapley_sample\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All set, it's now test time. We reuse the `X_test_fraud` NumPy array we created previously: we set it as a tensor, run the Shapley script, and read the explanations back as an array. The arguments request 20 permutation samples, declare that the model has 2 outputs, and explain output tensor 0 (the predicted class)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Winning feature: 14\n"
+     ]
+    }
+   ],
+   "source": [
+    "rai.tensorset(\"fraud_input\", X_test_fraud, dtype=\"float\")\n",
+    "\n",
+    "rai.scriptexecute(\"shapley_script\", \"shapley_sample\", inputs=[\"fraud_input\"], keys=[\"fraud_detection_model\"], args=[\"20\", \"2\", \"0\"], outputs=[\"fraud_explanations\"])\n",
+    "\n",
+    "rai_expl = rai.tensorget(\"fraud_explanations\")\n",
+    "\n",
+    "winning_feature_redisai = np.argmax(rai_expl[0], axis=0)\n",
+    "\n",
+    "print(\"Winning feature: %d\" % winning_feature_redisai)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, we can set up a RedisAI DAG and run everything in one go."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dag = rai.dag(routing=\"fraud_detection_model\")\n",
+    "dag.tensorset(\"fraud_input\", X_test_fraud, dtype=\"float\")\n",
+    "dag.modelexecute(\"fraud_detection_model\", \"fraud_input\", [\"fraud_pred\", \"fraud_prob\"])\n",
+    "dag.scriptexecute(\"shapley_script\", \"shapley_sample\", inputs=[\"fraud_input\"], keys=[\"fraud_detection_model\"], args=[\"20\", \"2\", \"0\"], outputs=[\"fraud_explanations\"])\n",
+    "dag.tensorget(\"fraud_pred\")\n",
+    "dag.tensorget(\"fraud_explanations\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now request the DAG execution, which sets the input and produces the desired outputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, _, _, dag_pred, dag_expl = dag.execute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,\n",
+       "       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dag_pred"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can now check that the winning feature for the first sample in the test batch matches what we computed earlier. Feature index 14 corresponds to column `V14`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Winning feature: 14\n"
+     ]
+    }
+   ],
+   "source": [
+    "winning_feature_redisai_dag = np.argmax(dag_expl[0])\n",
+    "\n",
+    "print(\"Winning feature: %d\" % winning_feature_redisai_dag)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, these are the attributions for the second fraud sample in the batch:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0.  , -0.05,  0.  ,  0.05,  0.2 ,  0.  ,  0.  ,  0.  ,  0.05,\n",
+       "        0.  ,  0.  ,  0.  ,  0.3 ,  0.  ,  0.4 ,  0.  ,  0.  , -0.05,\n",
+       "        0.  ,  0.05,  0.  ,  0.05,  0.  , -0.05,  0.05,  0.  ,  0.  ,\n",
+       "        0.  ,  0.  ,  0.  ])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dag_expl[1]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/shapley_explainability/torch_shapley.py b/notebooks/shapley_explainability/torch_shapley.py
new file mode 100644
index 0000000..07a6bb7
--- /dev/null
+++ b/notebooks/shapley_explainability/torch_shapley.py
@@ -0,0 +1,93 @@
+
+
+# TorchScript implementation of Shapley Value Sampling.
+# See https://captum.ai/api/shapley_value_sampling.html for
+# reference. From that source:
+# A perturbation based approach to compute attribution, based on the concept
+# of Shapley Values from cooperative game theory. This method involves taking
+# a random permutation of the input features and adding them one-by-one to the
+# given baseline. The output difference after adding each feature corresponds
+# to its attribution, and these differences are averaged when repeating this
+# process n_samples times, each time choosing a new random permutation of
+# the input features.
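+#
+# In symbols (a note added here, not from the Captum docs): for feature i and a
+# sampled permutation pi, let S_pi(i) be the set of features preceding i in pi.
+# Then
+#
+#   attrib_i = (1 / n_samples) * sum over sampled pi of
+#              [ f(x masked to S_pi(i) plus {i}) - f(x masked to S_pi(i)) ]
+#
+# where "masked to" means that features outside the set are replaced by their
+# baseline values.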
+
+
+# Script inputs
+
+# Tensors:
+#   tensors[0] - x : Input tensor to the model
+#   tensors[1] - baselines : Optional - reference values which replace each feature when
+#                            ablated; if no baselines are provided, baselines are set
+#                            to all zeros
+
+# Keys:
+#   keys[0] - model_key: Redis key name where the model is stored as a RedisAI model.
+
+# Args:
+#   args[0] - n_samples: number of random feature permutations performed
+#   args[1] - number_of_outputs: number of model outputs
+#   args[2] - output_tensor_index: index of the tested output tensor
+#   args[3] - Optional - target: output indices for which Shapley Value Sampling is
+#                                computed; if the model returns a single scalar,
+#                                target can be None
+
+# Note: torch, Tensor, List and the redisAI module are provided by the RedisAI
+# TorchScript runtime, so no imports are needed in this file.
+
+
+def generate_permutations(x, n_samples: int):
+    n_features = torch.numel(x[0])
+    return [torch.randperm(n_features) for _ in range(n_samples)]
+
+def index_with_target(x, target: int):
+    # Transpose is used in order to avoid iterating over batches
+    x_t = torch.transpose(x, 0, -1)
+    x_target_t = x_t[target]
+    return torch.transpose(x_target_t, 0, -1)
+
+# Binary classification: no need for a target (output size is 1).
+# Multiple outputs (output vector): target specifies the output index to explain.
+def shapley_sample(tensors: List[Tensor], keys: List[str], args: List[str]):
+    model_key = keys[0]
+    x = tensors[0]
+    n_samples = int(args[0])
+    number_of_outputs = int(args[1])
+    output_tensor_index = int(args[2])
+    if len(args) == 4:
+        target = int(args[3])
+    else:
+        target = None
+
+    attrib = torch.zeros_like(x)
+
+    if len(tensors) == 2:
+        baselines = tensors[1]
+    else:
+        baselines = torch.zeros_like(x)
+
+    permutations = generate_permutations(x, n_samples)
+
+    n_features = torch.numel(x[0])
+
+    for permutation in permutations:
+        # Start from the baseline: replace every sample in the batch with a
+        # randomly chosen baseline row
+        current = x.clone()
+        for batch_i in range(current.shape[0]):
+            current[batch_i] = baselines[
+                int(torch.randint(low=0, high=baselines.shape[0], size=(1,)))
+            ]
+        prev_out = redisAI.model_execute(model_key, [current], number_of_outputs)
+        prev_out_target = index_with_target(prev_out[output_tensor_index], target) if target is not None else prev_out[output_tensor_index]
+
+        # Check how the current output target differs from the previous output target
+        for feature_i in range(n_features):
+            permuted_feature_i = int(permutation[feature_i])
+            current[:, permuted_feature_i] = x[:, permuted_feature_i]
+            out = redisAI.model_execute(model_key, [current], number_of_outputs)
+            out_target = index_with_target(out[output_tensor_index], target) if target is not None else out[output_tensor_index]
+            # Add the contribution of the feature added in the current iteration
+            attrib[:, permuted_feature_i] += out_target - prev_out_target
+            prev_out_target = out_target
+
+    attrib /= n_samples
+
+    return attrib
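+
+
+# Example invocation from redis-cli (a sketch added for reference; key names
+# match the notebook, and the exact AI.SCRIPTEXECUTE syntax may vary across
+# RedisAI versions):
+#
+#   AI.SCRIPTEXECUTE shapley_script shapley_sample INPUTS 1 fraud_input KEYS 1 fraud_detection_model ARGS 3 20 2 0 OUTPUTS 1 fraud_explanations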