automl
diff --git a/‎development/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎development/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎development/_downloads/4f9b78e1d6464520c85232e30bf19d2b/example_text_preprocessing.ipynb
Lines changed: 72 additions & 0 deletions b/‎development/_downloads/4f9b78e1d6464520c85232e30bf19d2b/example_text_preprocessing.ipynb
Lines changed: 72 additions & 0 deletions
diff --git a/‎development/_downloads/89acefb6af0174645412e5af4eafade1/example_text_preprocessing.py
Lines changed: 67 additions & 0 deletions b/‎development/_downloads/89acefb6af0174645412e5af4eafade1/example_text_preprocessing.py
Lines changed: 67 additions & 0 deletions
diff --git a/‎development/_downloads/bc82bea3a5dd7bdba60b65220891d9e5/examples_python.zip
2.46 KB b/‎development/_downloads/bc82bea3a5dd7bdba60b65220891d9e5/examples_python.zip
2.46 KB
diff --git a/‎development/_downloads/fb625db3c50d423b1b7881136ffdeec8/examples_jupyter.zip
3.61 KB b/‎development/_downloads/fb625db3c50d423b1b7881136ffdeec8/examples_jupyter.zip
3.61 KB
diff --git a/‎development/_images/sphx_glr_example_inspect_predictions_001.png
786 Bytes b/‎development/_images/sphx_glr_example_inspect_predictions_001.png
786 Bytes
diff --git a/‎development/_images/sphx_glr_example_inspect_predictions_002.png
-3.29 KB b/‎development/_images/sphx_glr_example_inspect_predictions_002.png
-3.29 KB
diff --git a/‎development/_images/sphx_glr_example_inspect_predictions_003.png
-1.7 KB b/‎development/_images/sphx_glr_example_inspect_predictions_003.png
-1.7 KB
diff --git a/‎development/_images/sphx_glr_example_inspect_predictions_thumb.png
434 Bytes b/‎development/_images/sphx_glr_example_inspect_predictions_thumb.png
434 Bytes
diff --git a/‎development/_images/sphx_glr_example_pandas_train_test_001.png
-147 Bytes b/‎development/_images/sphx_glr_example_pandas_train_test_001.png
-147 Bytes
diff --git a/‎development/_images/sphx_glr_example_pandas_train_test_thumb.png
284 Bytes b/‎development/_images/sphx_glr_example_pandas_train_test_thumb.png
284 Bytes
diff --git a/‎development/_images/sphx_glr_example_regression_001.png
662 Bytes b/‎development/_images/sphx_glr_example_regression_001.png
662 Bytes
diff --git a/‎development/_images/sphx_glr_example_regression_thumb.png
966 Bytes b/‎development/_images/sphx_glr_example_regression_thumb.png
966 Bytes
diff --git a/‎development/_images/sphx_glr_example_text_preprocessing_thumb.png
26.2 KB b/‎development/_images/sphx_glr_example_text_preprocessing_thumb.png
26.2 KB
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 8a26f7fbaa1576935d6b4916c5b79de9
+config: 19b39b196a4ce26d6f98b3eb2c061df5
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,72 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Text Preprocessing\nThis example shows how to use text features in *auto-sklearn*. *auto-sklearn* can automatically\nencode text features if they are provided as string type in a pandas dataframe.\n\nTo process text features, you need a pandas dataframe in which the desired\ntext columns are set to string and the categorical columns to category.\n\nThe *auto-sklearn* text embedding creates a bag-of-words count.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import sklearn.metrics\nimport sklearn.datasets\nimport autosklearn.classification"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Data Loading\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True)\n\n# by default, the columns which should be strings are not formatted as such\nprint(f\"{X.info()}\\n\")\n\n# manually convert these to string columns\nX = X.astype({'name': 'string', 'ticket': 'string', 'cabin': 'string', 'boat': 'string',\n              'home.dest': 'string'})\n\n# now *auto-sklearn* handles the string columns with its text feature preprocessing pipeline\n\nX_train, X_test, y_train, y_test = \\\n     sklearn.model_selection.train_test_split(X, y, random_state=1)\n\ncls = autosklearn.classification.AutoSklearnClassifier(\n    time_left_for_this_task=30,\n    # The two flags below are provided to speed up calculations\n    # Not recommended for a real implementation\n    initial_configurations_via_metalearning=0,\n    smac_scenario_args={'runcount_limit': 1},\n)\n\ncls.fit(X_train, y_train, X_test, y_test)\n\npredictions = cls.predict(X_test)\nprint(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n\n\nX, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)\nX = X.select_dtypes(exclude=['object'])\n\nX_train, X_test, y_train, y_test = \\\n     sklearn.model_selection.train_test_split(X, y, random_state=1)\n\ncls = autosklearn.classification.AutoSklearnClassifier(\n    time_left_for_this_task=30,\n    # The two flags below are provided to speed up calculations\n    # Not recommended for a real implementation\n    initial_configurations_via_metalearning=0,\n    smac_scenario_args={'runcount_limit': 1},\n)\n\ncls.fit(X_train, y_train, X_test, y_test)\n\npredictions = cls.predict(X_test)\nprint(\"Accuracy score without text preprocessing\", sklearn.metrics.accuracy_score(y_test, predictions))"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.12"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
@@ -0,0 +1,67 @@
+# -*- encoding: utf-8 -*-
+"""
+==================
+Text Preprocessing
+==================
+This example shows how to use text features in *auto-sklearn*. *auto-sklearn* can automatically
+encode text features if they are provided as string type in a pandas dataframe.
+
+To process text features, you need a pandas dataframe in which the desired
+text columns are set to string and the categorical columns to category.
+
+The *auto-sklearn* text embedding creates a bag-of-words count.
+"""
+import sklearn.datasets
+import sklearn.metrics
+import sklearn.model_selection
+import autosklearn.classification
+
+############################################################################
+# Data Loading
+# ============
+
+# Load the OpenML dataset with id 40945. A pandas DataFrame is requested
+# explicitly because the dtype inspection and conversion below rely on it.
+X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)
+
+# by default, the columns which should be strings are not formatted as such
+# (DataFrame.info() prints its summary itself and returns None, so it is
+# called directly instead of being interpolated into a printed f-string,
+# which would emit a stray "None")
+X.info()
+print()
+
+# manually convert these to string columns
+X = X.astype({'name': 'string', 'ticket': 'string', 'cabin': 'string', 'boat': 'string',
+              'home.dest': 'string'})
+
+# now *auto-sklearn* handles the string columns with its text feature preprocessing pipeline
+
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+    X, y, random_state=1
+)
+
+cls = autosklearn.classification.AutoSklearnClassifier(
+    time_left_for_this_task=30,
+    # The two flags below are provided to speed up calculations
+    # Not recommended for a real implementation
+    initial_configurations_via_metalearning=0,
+    smac_scenario_args={'runcount_limit': 1},
+)
+
+cls.fit(X_train, y_train, X_test, y_test)
+
+predictions = cls.predict(X_test)
+print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
+
+
+# Baseline for comparison: reload the data and drop every object-dtype column
+# before fitting, so no column is converted to the string dtype this time.
+X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)
+X = X.select_dtypes(exclude=['object'])
+
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+    X, y, random_state=1
+)
+
+cls = autosklearn.classification.AutoSklearnClassifier(
+    time_left_for_this_task=30,
+    # The two flags below are provided to speed up calculations
+    # Not recommended for a real implementation
+    initial_configurations_via_metalearning=0,
+    smac_scenario_args={'runcount_limit': 1},
+)
+
+cls.fit(X_train, y_train, X_test, y_test)
+
+predictions = cls.predict(X_test)
+print(
+    "Accuracy score without text preprocessing",
+    sklearn.metrics.accuracy_score(y_test, predictions),
+)