Skip to content

Commit 94cd247

Browse files
author
Github Actions
committed
Eddie Bergman: Doc: Adds documentation for the dataset compression argument from #1341 and #1250 (#1386)
1 parent 7cef83d commit 94cd247

File tree

96 files changed

+4140
-4039
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

96 files changed

+4140
-4039
lines changed

development/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 8a26f7fbaa1576935d6b4916c5b79de9
3+
config: 19b39b196a4ce26d6f98b3eb2c061df5
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Text Preprocessing\nThis example shows how to use text features in *auto-sklearn*. *auto-sklearn* can automatically\nencode text features if they are provided as string type in a pandas dataframe.\n\nTo process text features, you need a pandas dataframe with the desired\ntext columns set to string and the categorical columns set to category.\n\nThe *auto-sklearn* text embedding creates a bag-of-words count.\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"import sklearn.metrics\nimport sklearn.datasets\nimport autosklearn.classification"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"## Data Loading\n\n"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {
43+
"collapsed": false
44+
},
45+
"outputs": [],
46+
"source": [
47+
"X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True)\n\n# by default, the columns which should be strings are not formatted as such\nprint(f\"{X.info()}\\n\")\n\n# manually convert these to string columns\nX = X.astype({'name': 'string', 'ticket': 'string', 'cabin': 'string', 'boat': 'string',\n 'home.dest': 'string'})\n\n# now *auto-sklearn* handles the string columns with its text feature preprocessing pipeline\n\nX_train, X_test, y_train, y_test = \\\n sklearn.model_selection.train_test_split(X, y, random_state=1)\n\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=30,\n # Bellow two flags are provided to speed up calculations\n # Not recommended for a real implementation\n initial_configurations_via_metalearning=0,\n smac_scenario_args={'runcount_limit': 1},\n)\n\ncls.fit(X_train, y_train, X_test, y_test)\n\npredictions = cls.predict(X_test)\nprint(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, predictions))\n\n\nX, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)\nX = X.select_dtypes(exclude=['object'])\n\nX_train, X_test, y_train, y_test = \\\n sklearn.model_selection.train_test_split(X, y, random_state=1)\n\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=30,\n # Bellow two flags are provided to speed up calculations\n # Not recommended for a real implementation\n initial_configurations_via_metalearning=0,\n smac_scenario_args={'runcount_limit': 1},\n)\n\ncls.fit(X_train, y_train, X_test, y_test)\n\npredictions = cls.predict(X_test)\nprint(\"Accuracy score without text preprocessing\", sklearn.metrics.accuracy_score(y_test, predictions))"
48+
]
49+
}
50+
],
51+
"metadata": {
52+
"kernelspec": {
53+
"display_name": "Python 3",
54+
"language": "python",
55+
"name": "python3"
56+
},
57+
"language_info": {
58+
"codemirror_mode": {
59+
"name": "ipython",
60+
"version": 3
61+
},
62+
"file_extension": ".py",
63+
"mimetype": "text/x-python",
64+
"name": "python",
65+
"nbconvert_exporter": "python",
66+
"pygments_lexer": "ipython3",
67+
"version": "3.8.12"
68+
}
69+
},
70+
"nbformat": 4,
71+
"nbformat_minor": 0
72+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- encoding: utf-8 -*-
2+
"""
3+
==================
4+
Text Preprocessing
5+
==================
6+
This example shows how to use text features in *auto-sklearn*. *auto-sklearn* can automatically
7+
encode text features if they are provided as string type in a pandas dataframe.
8+
9+
To process text features, you need a pandas dataframe with the desired
10+
text columns set to string and the categorical columns set to category.
11+
12+
The *auto-sklearn* text embedding creates a bag-of-words count.
13+
"""
14+
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

import autosklearn.classification
17+
18+
############################################################################
19+
# Data Loading
20+
# ============
21+
22+
# Request a pandas DataFrame explicitly: everything below relies on the
# DataFrame API (``info``, ``astype`` and ``select_dtypes``).
X_raw, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)

# By default, the columns which should be strings are not formatted as such.
# ``DataFrame.info`` writes its report to stdout and returns ``None``, so it is
# called directly instead of being interpolated into a ``print``.
X_raw.info()
print()

# Manually convert these to string columns.
TEXT_COLUMNS = ['name', 'ticket', 'cabin', 'boat', 'home.dest']
X_text = X_raw.astype({column: 'string' for column in TEXT_COLUMNS})

# For comparison, a variant of the data with every object-typed column removed,
# derived from the same raw frame so the dataset is only fetched once.
X_no_text = X_raw.select_dtypes(exclude=['object'])


############################################################################
# Build and fit a classifier
# ==========================

def fit_and_score(X, y):
    """Fit an *auto-sklearn* classifier on a train split and score it on the test split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns of dtype ``string`` are the ones intended for
        the text feature preprocessing pipeline.
    y : pandas.Series
        Target labels aligned with ``X``.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out test split.
    """
    # A fixed ``random_state`` gives every call the same train/test partition,
    # so the scores of different feature sets are comparable.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1
    )

    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=30,
        # The two flags below are provided to speed up calculations
        # Not recommended for a real implementation
        initial_configurations_via_metalearning=0,
        smac_scenario_args={'runcount_limit': 1},
    )
    cls.fit(X_train, y_train, X_test, y_test)

    predictions = cls.predict(X_test)
    return sklearn.metrics.accuracy_score(y_test, predictions)


############################################################################
# Compare the results with and without text preprocessing
# =======================================================

# now *auto-sklearn* handles the string columns with its text feature preprocessing pipeline
print("Accuracy score", fit_and_score(X_text, y))

print("Accuracy score without text preprocessing", fit_and_score(X_no_text, y))
Binary file not shown.
Binary file not shown.
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading

0 commit comments

Comments
 (0)