From 095dfdbb9d0d27ba644cd03dfd11cd50f14c94b9 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 13 Jun 2019 14:20:05 +0000 Subject: [PATCH 1/4] Updating reviews, movie-ratings and mnist --- examples/mnist/implementations/models/dnn.py | 2 +- examples/mnist/resources/environments.yaml | 21 ++++- examples/mnist/resources/models.yaml | 28 +++---- examples/mnist/resources/raw_columns.yaml | 11 --- .../mnist/resources/transformed_columns.yaml | 7 +- examples/movie-ratings/resources/apis.yaml | 2 +- .../movie-ratings/resources/environments.yaml | 4 +- examples/movie-ratings/resources/models.yaml | 10 ++- .../resources/transformed_columns.yaml | 34 ++++---- .../implementations/aggregators/max_length.py | 4 +- .../implementations/aggregators/vocab.py | 6 +- .../transformers/tokenize_string_to_int.py | 10 +-- examples/reviews/resources/apis.yaml | 30 +++---- examples/reviews/resources/columns.yaml | 39 ++++++--- examples/reviews/resources/max_length.yaml | 6 -- examples/reviews/resources/models.yaml | 84 +++++++++---------- examples/reviews/resources/vocab.yaml | 15 ---- 17 files changed, 154 insertions(+), 159 deletions(-) delete mode 100644 examples/mnist/resources/raw_columns.yaml delete mode 100644 examples/reviews/resources/max_length.yaml delete mode 100644 examples/reviews/resources/vocab.yaml diff --git a/examples/mnist/implementations/models/dnn.py b/examples/mnist/implementations/models/dnn.py index 7f3c97d95b..4f5b9fe1dd 100644 --- a/examples/mnist/implementations/models/dnn.py +++ b/examples/mnist/implementations/models/dnn.py @@ -4,7 +4,7 @@ def create_estimator(run_config, model_config): feature_columns = [ tf.feature_column.numeric_column( - "image_pixels", shape=model_config["hparams"]["input_shape"] + model_config["input"]["image_pixels"], shape=model_config["hparams"]["input_shape"] ) ] diff --git a/examples/mnist/resources/environments.yaml b/examples/mnist/resources/environments.yaml index 660927f011..3eee3607e8 100644 --- a/examples/mnist/resources/environments.yaml +++ b/examples/mnist/resources/environments.yaml @@ -5,6 +5,21 @@ path: s3a://cortex-examples/mnist.csv csv_config: header: true - schema: - - image - - label + schema: [@image, @label] + +- kind: raw_column + name: image + type: STRING_COLUMN + required: true + +- kind: raw_column + name: label + type: INT_COLUMN + required: true + min: 0 + max: 9 + +- kind: transformed_column + name: image_pixels + transformer_path: implementations/transformers/decode_and_normalize.py + input: @image diff --git a/examples/mnist/resources/models.yaml b/examples/mnist/resources/models.yaml index a789ab9d0c..fac8d751ac 100644 --- a/examples/mnist/resources/models.yaml +++ b/examples/mnist/resources/models.yaml @@ -1,14 +1,12 @@ - kind: model name: dnn - path: implementations/models/dnn.py - type: classification - target_column: label - feature_columns: - - image_pixels + estimator_path: implementations/models/dnn.py + target_column: @label + input: + image: @image_pixels hparams: - learning_rate: 0.01 input_shape: [784] - output_shape: [10] + learning_rate: 0.01 hidden_units: [100, 200] data_partition_ratio: training: 0.7 @@ -16,11 +14,9 @@ - kind: model name: conv - path: implementations/models/custom.py - type: classification - target_column: label - feature_columns: - - image_pixels + estimator_path: implementations/models/custom.py + target_column: @label + feature_columns: [@image_pixels] hparams: layer_type: conv learning_rate: 0.01 @@ -38,11 +34,9 @@ - kind: model name: t2t - path: implementations/models/t2t.py - type: classification - target_column: label - feature_columns: - - image_pixels + estimator_path: implementations/models/t2t.py + target_column: @label + feature_columns: [@image_pixels] prediction_key: outputs hparams: input_shape: [28, 28, 1] diff --git a/examples/mnist/resources/raw_columns.yaml b/examples/mnist/resources/raw_columns.yaml deleted file mode 100644 index 742e506f9c..0000000000 --- a/examples/mnist/resources/raw_columns.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- kind: raw_column - name: image - type: STRING_COLUMN - required: true - -- kind: raw_column - name: label - type: INT_COLUMN - required: true - min: 0 - max: 9 diff --git a/examples/mnist/resources/transformed_columns.yaml b/examples/mnist/resources/transformed_columns.yaml index 4c736b0880..8b13789179 100644 --- a/examples/mnist/resources/transformed_columns.yaml +++ b/examples/mnist/resources/transformed_columns.yaml @@ -1,6 +1 @@ -- kind: transformed_column - name: image_pixels - transformer_path: implementations/transformers/decode_and_normalize.py - inputs: - columns: - image: image + diff --git a/examples/movie-ratings/resources/apis.yaml b/examples/movie-ratings/resources/apis.yaml index 187089009a..d5e5b259dd 100644 --- a/examples/movie-ratings/resources/apis.yaml +++ b/examples/movie-ratings/resources/apis.yaml @@ -1,5 +1,5 @@ - kind: api name: ratings - model_name: basic_embedding + model: @basic_embedding compute: replicas: 1 diff --git a/examples/movie-ratings/resources/environments.yaml b/examples/movie-ratings/resources/environments.yaml index a8604c6525..8094c2c993 100644 --- a/examples/movie-ratings/resources/environments.yaml +++ b/examples/movie-ratings/resources/environments.yaml @@ -1,11 +1,13 @@ - kind: environment name: dev + log_level: + spark: "ALL" data: type: csv path: s3a://cortex-examples/movie-ratings.csv csv_config: header: true - schema: ['user_id','movie_id','rating','timestamp'] + schema: [@user_id, @movie_id, @rating, @timestamp] - kind: raw_column name: user_id diff --git a/examples/movie-ratings/resources/models.yaml b/examples/movie-ratings/resources/models.yaml index 8f5e0ef5c8..7361265327 100644 --- a/examples/movie-ratings/resources/models.yaml +++ b/examples/movie-ratings/resources/models.yaml @@ -1,8 +1,12 @@ - kind: model name: basic_embedding - type: regression - target_column: rating - feature_columns: [user_id_indexed, movie_id_indexed] + estimator: cortex.dnn_regressor + target_column: @rating + input: + categorical_columns_with_identity: + - col @user_id_indexed + + movie_id_indexed] aggregates: [user_id_index, movie_id_index] hparams: embedding_size: 10 diff --git a/examples/movie-ratings/resources/transformed_columns.yaml b/examples/movie-ratings/resources/transformed_columns.yaml index ac1a27250f..76dc9ffcfc 100644 --- a/examples/movie-ratings/resources/transformed_columns.yaml +++ b/examples/movie-ratings/resources/transformed_columns.yaml @@ -1,31 +1,33 @@ +- kind: aggregate + name: user_id_size + aggregator: cortex.count_distinct + input: @user_id + - kind: aggregate name: user_id_index aggregator: cortex.index_string - inputs: - columns: - col: user_id + input: @user_id - kind: transformed_column name: user_id_indexed transformer: cortex.index_string - inputs: - columns: - text: user_id - args: - indexes: user_id_index + input: + col: user_id + indexes: user_id_index + +- kind: aggregate + name: movie_id_size + aggregator: cortex.count_distinct + input: @movie_id - kind: aggregate name: movie_id_index aggregator: cortex.index_string - inputs: - columns: - col: movie_id + input: @movie_id - kind: transformed_column name: movie_id_indexed transformer: cortex.index_string - inputs: - columns: - text: movie_id - args: - indexes: movie_id_index + input: + col: movie_id + indexes: movie_id_index diff --git a/examples/reviews/implementations/aggregators/max_length.py b/examples/reviews/implementations/aggregators/max_length.py index 5552024dbf..8034888dd9 100644 --- a/examples/reviews/implementations/aggregators/max_length.py +++ b/examples/reviews/implementations/aggregators/max_length.py @@ -1,9 +1,9 @@ -def aggregate_spark(data, columns, args): +def aggregate_spark(data, input): from pyspark.ml.feature import RegexTokenizer import pyspark.sql.functions as F from pyspark.sql.types import IntegerType - regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W") + regexTokenizer = RegexTokenizer(inputCol=input, outputCol="token_list", pattern="\\W") regexTokenized = regexTokenizer.transform(data) max_review_length_row = ( diff --git a/examples/reviews/implementations/aggregators/vocab.py b/examples/reviews/implementations/aggregators/vocab.py index 39e0dc7c71..7377585eb8 100644 --- a/examples/reviews/implementations/aggregators/vocab.py +++ b/examples/reviews/implementations/aggregators/vocab.py @@ -1,8 +1,8 @@ -def aggregate_spark(data, columns, args): +def aggregate_spark(data, input): import pyspark.sql.functions as F from pyspark.ml.feature import RegexTokenizer - regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W") + regexTokenizer = RegexTokenizer(inputCol=input["col"], outputCol="token_list", pattern="\\W") regexTokenized = regexTokenizer.transform(data) vocab_rows = ( @@ -10,7 +10,7 @@ def aggregate_spark(data, columns, args): .groupBy("word") .count() .orderBy(F.col("count").desc()) - .limit(args["vocab_size"]) + .limit(input["vocab_size"]) .select("word") .collect() ) diff --git a/examples/reviews/implementations/transformers/tokenize_string_to_int.py b/examples/reviews/implementations/transformers/tokenize_string_to_int.py index 1ab078cc69..151c18f754 100644 --- a/examples/reviews/implementations/transformers/tokenize_string_to_int.py +++ b/examples/reviews/implementations/transformers/tokenize_string_to_int.py @@ -3,19 +3,19 @@ non_word = re.compile("\\W") -def transform_python(sample, args): - text = sample["col"].lower() +def transform_python(input): + text = input["col"].lower() token_index_list = [] - vocab = args["vocab"] + vocab = input["vocab"] for token in non_word.split(text): if len(token) == 0: continue token_index_list.append(vocab.get(token, vocab[""])) - if len(token_index_list) == args["max_len"]: + if len(token_index_list) == input["max_len"]: break - for i in range(args["max_len"] - len(token_index_list)): + for i in range(input["max_len"] - len(token_index_list)): token_index_list.append(vocab[""]) return token_index_list diff --git a/examples/reviews/resources/apis.yaml b/examples/reviews/resources/apis.yaml index 56b549968b..6c55273ac3 100644 --- a/examples/reviews/resources/apis.yaml +++ b/examples/reviews/resources/apis.yaml @@ -1,17 +1,17 @@ -- kind: api - name: sentiment-dnn - model_name: sentiment_dnn - compute: - replicas: 1 +# - kind: api +# name: sentiment-dnn +# model_name: sentiment_dnn +# compute: +# replicas: 1 -- kind: api - name: sentiment-linear - model_name: sentiment_linear - compute: - replicas: 1 +# - kind: api +# name: sentiment-linear +# model: @sentiment_linear +# compute: +# replicas: 1 -- kind: api - name: sentiment-t2t - model_name: transformer - compute: - replicas: 1 +# - kind: api +# name: sentiment-t2t +# model_name: transformer +# compute: +# replicas: 1 diff --git a/examples/reviews/resources/columns.yaml b/examples/reviews/resources/columns.yaml index 562a132d56..09fa2a033b 100644 --- a/examples/reviews/resources/columns.yaml +++ b/examples/reviews/resources/columns.yaml @@ -1,28 +1,43 @@ - kind: environment name: dev + log_level: + spark: INFO data: type: csv path: s3a://cortex-examples/reviews.csv csv_config: header: true escape: "\"" - schema: ["review", "label"] + schema: [@review, @label] + +- kind: aggregate + name: max_review_length + aggregator_path: implementations/aggregators/max_length.py + input: @review + +- kind: aggregate + name: reviews_vocab + aggregator_path: implementations/aggregators/vocab.py + input: + col: @review + vocab_size: 10000 + +- kind: aggregate + name: label_index + aggregator: cortex.index_string + input: @label - kind: transformed_column name: embedding_input transformer_path: implementations/transformers/tokenize_string_to_int.py - inputs: - columns: - col: review - args: - max_len: max_review_length - vocab: reviews_vocab + input: + col: @review + max_len: @max_review_length + vocab: @reviews_vocab - kind: transformed_column name: label_indexed transformer: cortex.index_string - inputs: - columns: - text: label - args: - indexes: label_index + input: + col: @label + indexes: @label_index diff --git a/examples/reviews/resources/max_length.yaml b/examples/reviews/resources/max_length.yaml deleted file mode 100644 index 168190f03f..0000000000 --- a/examples/reviews/resources/max_length.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- kind: aggregate - name: max_review_length - aggregator_path: implementations/aggregators/max_length.py - inputs: - columns: - col: review diff --git a/examples/reviews/resources/models.yaml b/examples/reviews/resources/models.yaml index c6aa6ca629..c4f7a9396c 100644 --- a/examples/reviews/resources/models.yaml +++ b/examples/reviews/resources/models.yaml @@ -1,28 +1,28 @@ -- kind: model - name: sentiment_dnn - type: classification - target_column: label_indexed - feature_columns: - - embedding_input - aggregates: - - reviews_vocab - hparams: - learning_rate: 0.01 - data_partition_ratio: - training: 0.8 - evaluation: 0.2 - training: - batch_size: 64 - num_steps: 5000 +# - kind: model +# name: sentiment_dnn +# estimator: +# target_column: label_indexed +# feature_columns: +# - embedding_input +# aggregates: +# - reviews_vocab +# hparams: +# learning_rate: 0.01 +# data_partition_ratio: +# training: 0.8 +# evaluation: 0.2 +# training: +# batch_size: 64 +# num_steps: 5000 - kind: model name: sentiment_linear - type: classification - target_column: label_indexed - feature_columns: - - embedding_input - aggregates: - - reviews_vocab + estimator: cortex.linear_classifier + target_column: @label_indexed + input: + categorical_columns_with_vocab: + - col: @embedding_input + vocab: @reviews_vocab data_partition_ratio: training: 0.8 evaluation: 0.2 @@ -30,23 +30,23 @@ batch_size: 64 num_steps: 5000 -- kind: model - name: transformer - type: classification - target_column: label_indexed - feature_columns: - - embedding_input - aggregates: - - max_review_length - - reviews_vocab - prediction_key: outputs - data_partition_ratio: - training: 0.8 - evaluation: 0.2 - training: - batch_size: 16 - num_steps: 250000 - evaluation: - start_delay_secs: 1 - compute: - gpu: 1 +# - kind: model +# name: transformer +# type: classification +# target_column: label_indexed +# feature_columns: +# - embedding_input +# aggregates: +# - max_review_length +# - reviews_vocab +# prediction_key: outputs +# data_partition_ratio: +# training: 0.8 +# evaluation: 0.2 +# training: +# batch_size: 16 +# num_steps: 250000 +# evaluation: +# start_delay_secs: 1 +# compute: +# gpu: 1 diff --git a/examples/reviews/resources/vocab.yaml b/examples/reviews/resources/vocab.yaml deleted file mode 100644 index a2da56f53f..0000000000 --- a/examples/reviews/resources/vocab.yaml +++ /dev/null @@ -1,15 +0,0 @@ -- kind: aggregate - name: reviews_vocab - aggregator_path: implementations/aggregators/vocab.py - inputs: - columns: - col: review - args: - vocab_size: 10000 - -- kind: aggregate - name: label_index - aggregator: cortex.index_string - inputs: - columns: - col: label From 6f0b0cf492609e95325a9cbfa2a4ae268a6ea0d9 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 13 Jun 2019 20:52:00 +0000 Subject: [PATCH 2/4] Update round 2 --- examples/mnist/implementations/models/dnn.py | 2 +- .../transformers/decode_and_normalize.py | 6 +- examples/mnist/resources/apis.yaml | 6 +- examples/mnist/resources/environments.yaml | 25 ------ examples/mnist/resources/models.yaml | 7 +- .../mnist/resources/transformed_columns.yaml | 1 - .../implementations/models/basic_embedding.py | 22 ++---- .../movie-ratings/resources/environments.yaml | 22 ------ examples/movie-ratings/resources/models.yaml | 16 ++-- .../resources/transformed_columns.yaml | 33 -------- .../implementations/models/sentiment_dnn.py | 2 +- .../models/sentiment_linear.py | 2 +- .../implementations/models/transformer.py | 4 +- .../transformers/tokenize_string_to_int.py | 9 ++- examples/reviews/resources/apis.yaml | 30 ++++---- examples/reviews/resources/columns.yaml | 43 ----------- examples/reviews/resources/models.yaml | 77 +++++++++---------- 17 files changed, 85 insertions(+), 222 deletions(-) delete mode 100644 examples/mnist/resources/environments.yaml delete mode 100644 examples/mnist/resources/transformed_columns.yaml delete mode 100644 examples/movie-ratings/resources/environments.yaml delete mode 100644 examples/movie-ratings/resources/transformed_columns.yaml delete mode 100644 examples/reviews/resources/columns.yaml diff --git a/examples/mnist/implementations/models/dnn.py b/examples/mnist/implementations/models/dnn.py index 4f5b9fe1dd..fbf317106f 100644 --- a/examples/mnist/implementations/models/dnn.py +++ b/examples/mnist/implementations/models/dnn.py @@ -4,7 +4,7 @@ def create_estimator(run_config, model_config): feature_columns = [ tf.feature_column.numeric_column( - model_config["input"]["image_pixels"], shape=model_config["hparams"]["input_shape"] + model_config["input"], shape=model_config["hparams"]["input_shape"] ) ] diff --git a/examples/mnist/implementations/transformers/decode_and_normalize.py b/examples/mnist/implementations/transformers/decode_and_normalize.py index 2ff41440e3..4933445c11 100644 --- a/examples/mnist/implementations/transformers/decode_and_normalize.py +++ b/examples/mnist/implementations/transformers/decode_and_normalize.py @@ -5,10 +5,8 @@ import math -def transform_python(sample, args): - image = sample["image"] - - decoded = base64.b64decode(image) +def transform_python(input): + decoded = base64.b64decode(input) decoded_image = np.asarray(Image.open(BytesIO(decoded)), dtype=np.uint8) # reimplmenting tf.per_image_standardization diff --git a/examples/mnist/resources/apis.yaml b/examples/mnist/resources/apis.yaml index 3bab92bd2a..5b1b966892 100644 --- a/examples/mnist/resources/apis.yaml +++ b/examples/mnist/resources/apis.yaml @@ -1,17 +1,17 @@ - kind: api name: dnn-classifier - model_name: dnn + model: @dnn compute: replicas: 1 - kind: api name: conv-classifier - model_name: conv + model: @conv compute: replicas: 1 - kind: api name: t2t-classifier - model_name: t2t + model: @t2t compute: replicas: 1 diff --git a/examples/mnist/resources/environments.yaml b/examples/mnist/resources/environments.yaml deleted file mode 100644 index 3eee3607e8..0000000000 --- a/examples/mnist/resources/environments.yaml +++ /dev/null @@ -1,25 +0,0 @@ -- kind: environment - name: dev - data: - type: csv - path: s3a://cortex-examples/mnist.csv - csv_config: - header: true - schema: [@image, @label] - -- kind: raw_column - name: image - type: STRING_COLUMN - required: true - -- kind: raw_column - name: label - type: INT_COLUMN - required: true - min: 0 - max: 9 - -- kind: transformed_column - name: image_pixels - transformer_path: implementations/transformers/decode_and_normalize.py - input: @image diff --git a/examples/mnist/resources/models.yaml b/examples/mnist/resources/models.yaml index fac8d751ac..a7f41c6118 100644 --- a/examples/mnist/resources/models.yaml +++ b/examples/mnist/resources/models.yaml @@ -2,8 +2,7 @@ name: dnn estimator_path: implementations/models/dnn.py target_column: @label - input: - image: @image_pixels + input: @image_pixels hparams: input_shape: [784] learning_rate: 0.01 @@ -16,7 +15,7 @@ name: conv estimator_path: implementations/models/custom.py target_column: @label - feature_columns: [@image_pixels] + input: @image_pixels hparams: layer_type: conv learning_rate: 0.01 @@ -36,7 +35,7 @@ name: t2t estimator_path: implementations/models/t2t.py target_column: @label - feature_columns: [@image_pixels] + input: @image_pixels prediction_key: outputs hparams: input_shape: [28, 28, 1] diff --git a/examples/mnist/resources/transformed_columns.yaml b/examples/mnist/resources/transformed_columns.yaml deleted file mode 100644 index 8b13789179..0000000000 --- a/examples/mnist/resources/transformed_columns.yaml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/examples/movie-ratings/implementations/models/basic_embedding.py b/examples/movie-ratings/implementations/models/basic_embedding.py index d2e64c43d5..3a0a507b3e 100644 --- a/examples/movie-ratings/implementations/models/basic_embedding.py +++ b/examples/movie-ratings/implementations/models/basic_embedding.py @@ -2,26 +2,18 @@ def create_estimator(run_config, model_config): - user_id_index = model_config["aggregates"]["user_id_index"] - movie_id_index = model_config["aggregates"]["movie_id_index"] - - feature_columns = [ - tf.feature_column.embedding_column( - tf.feature_column.categorical_column_with_identity( - "user_id_indexed", len(user_id_index) - ), - model_config["hparams"]["embedding_size"], - ), - tf.feature_column.embedding_column( + embedding_feature_columns = [] + for feature_col_data in model_config["input"]["embedding_columns"]: + embedding_col = tf.feature_column.embedding_column( tf.feature_column.categorical_column_with_identity( - "movie_id_indexed", len(movie_id_index) + feature_col_data["col"], len(feature_col_data["vocab"]["index"]) ), model_config["hparams"]["embedding_size"], - ), - ] + ) + embedding_feature_columns.append(embedding_col) return tf.estimator.DNNRegressor( - feature_columns=feature_columns, + feature_columns=embedding_feature_columns, hidden_units=model_config["hparams"]["hidden_units"], config=run_config, ) diff --git a/examples/movie-ratings/resources/environments.yaml b/examples/movie-ratings/resources/environments.yaml deleted file mode 100644 index 8094c2c993..0000000000 --- a/examples/movie-ratings/resources/environments.yaml +++ /dev/null @@ -1,22 +0,0 @@ -- kind: environment - name: dev - log_level: - spark: "ALL" - data: - type: csv - path: s3a://cortex-examples/movie-ratings.csv - csv_config: - header: true - schema: [@user_id, @movie_id, @rating, @timestamp] - -- kind: raw_column - name: user_id - type: STRING_COLUMN - -- kind: raw_column - name: movie_id - type: STRING_COLUMN - -- kind: raw_column - name: rating - type: FLOAT_COLUMN diff --git a/examples/movie-ratings/resources/models.yaml b/examples/movie-ratings/resources/models.yaml index 7361265327..639e2df7bc 100644 --- a/examples/movie-ratings/resources/models.yaml +++ b/examples/movie-ratings/resources/models.yaml @@ -1,16 +1,16 @@ - kind: model name: basic_embedding - estimator: cortex.dnn_regressor + estimator_path: implementations/models/basic_embedding.py target_column: @rating input: - categorical_columns_with_identity: - - col @user_id_indexed - - movie_id_indexed] - aggregates: [user_id_index, movie_id_index] + embedding_columns: + - col: @user_id_indexed + vocab: @user_id_index + - col: @movie_id_indexed + vocab: @movie_id_index hparams: - embedding_size: 10 - hidden_units: [128] + embedding_size: 20 + hidden_units: [10, 10] data_partition_ratio: training: 0.8 evaluation: 0.2 diff --git a/examples/movie-ratings/resources/transformed_columns.yaml b/examples/movie-ratings/resources/transformed_columns.yaml deleted file mode 100644 index 76dc9ffcfc..0000000000 --- a/examples/movie-ratings/resources/transformed_columns.yaml +++ /dev/null @@ -1,33 +0,0 @@ -- kind: aggregate - name: user_id_size - aggregator: cortex.count_distinct - input: @user_id - -- kind: aggregate - name: user_id_index - aggregator: cortex.index_string - input: @user_id - -- kind: transformed_column - name: user_id_indexed - transformer: cortex.index_string - input: - col: user_id - indexes: user_id_index - -- kind: aggregate - name: movie_id_size - aggregator: cortex.count_distinct - input: @movie_id - -- kind: aggregate - name: movie_id_index - aggregator: cortex.index_string - input: @movie_id - -- kind: transformed_column - name: movie_id_indexed - transformer: cortex.index_string - input: - col: movie_id - indexes: movie_id_index diff --git a/examples/reviews/implementations/models/sentiment_dnn.py b/examples/reviews/implementations/models/sentiment_dnn.py index b6506000b9..c9aa1155cf 100644 --- a/examples/reviews/implementations/models/sentiment_dnn.py +++ b/examples/reviews/implementations/models/sentiment_dnn.py @@ -4,7 +4,7 @@ def create_estimator(run_config, model_config): hparams = model_config["hparams"] - vocab_size = len(model_config["aggregates"]["reviews_vocab"]) + vocab_size = len(model_config["input"]["vocab"]) def model_fn(features, labels, mode, params): embedding_input = features["embedding_input"] diff --git a/examples/reviews/implementations/models/sentiment_linear.py b/examples/reviews/implementations/models/sentiment_linear.py index 1795e4cec7..6b1e512c14 100644 --- a/examples/reviews/implementations/models/sentiment_linear.py +++ b/examples/reviews/implementations/models/sentiment_linear.py @@ -2,7 +2,7 @@ def create_estimator(run_config, model_config): - vocab_size = len(model_config["aggregates"]["reviews_vocab"]) + vocab_size = len(model_config["input"]["vocab"]) feature_column = tf.feature_column.categorical_column_with_identity( "embedding_input", vocab_size ) diff --git a/examples/reviews/implementations/models/transformer.py b/examples/reviews/implementations/models/transformer.py index 0b03d52825..f4442abddc 100644 --- a/examples/reviews/implementations/models/transformer.py +++ b/examples/reviews/implementations/models/transformer.py @@ -13,7 +13,7 @@ def create_estimator(run_config, model_config): hparams = trainer_lib.create_hparams("transformer_base_single_gpu") # SentimentIMDBCortex subclasses SentimentIMDB - problem = SentimentIMDBCortex(list(model_config["aggregates"]["reviews_vocab"])) + problem = SentimentIMDBCortex(list(model_config["input"]["vocab"])) hparams.problem = problem hparams.problem_hparams = problem.get_hparams(hparams) @@ -39,7 +39,7 @@ def create_estimator(run_config, model_config): def transform_tensorflow(features, labels, model_config): - max_length = model_config["aggregates"]["max_review_length"] + max_length = model_config["input"]["max_review_length"] features["inputs"] = tf.expand_dims(tf.reshape(features["embedding_input"], [max_length]), -1) features["targets"] = tf.expand_dims(tf.expand_dims(labels, -1), -1) diff --git a/examples/reviews/implementations/transformers/tokenize_string_to_int.py b/examples/reviews/implementations/transformers/tokenize_string_to_int.py index 151c18f754..accd245f4f 100644 --- a/examples/reviews/implementations/transformers/tokenize_string_to_int.py +++ b/examples/reviews/implementations/transformers/tokenize_string_to_int.py @@ -4,18 +4,19 @@ def transform_python(input): - text = input["col"].lower() + lol = input + text = lol["col"].lower() token_index_list = [] - vocab = input["vocab"] + vocab = lol["vocab"] for token in non_word.split(text): if len(token) == 0: continue token_index_list.append(vocab.get(token, vocab[""])) - if len(token_index_list) == input["max_len"]: + if len(token_index_list) == lol["max_len"]: break - for i in range(input["max_len"] - len(token_index_list)): + for i in range(lol["max_len"] - len(token_index_list)): token_index_list.append(vocab[""]) return token_index_list diff --git a/examples/reviews/resources/apis.yaml b/examples/reviews/resources/apis.yaml index 6c55273ac3..0819d5eba4 100644 --- a/examples/reviews/resources/apis.yaml +++ b/examples/reviews/resources/apis.yaml @@ -1,17 +1,17 @@ -# - kind: api -# name: sentiment-dnn -# model_name: sentiment_dnn -# compute: -# replicas: 1 +- kind: api + name: sentiment-dnn + model: @sentiment_dnn + compute: + replicas: 1 -# - kind: api -# name: sentiment-linear -# model: @sentiment_linear -# compute: -# replicas: 1 +- kind: api + name: sentiment-linear + model: @sentiment_linear + compute: + replicas: 1 -# - kind: api -# name: sentiment-t2t -# model_name: transformer -# compute: -# replicas: 1 +- kind: api + name: sentiment-t2t + model: @transformer + compute: + replicas: 1 diff --git a/examples/reviews/resources/columns.yaml b/examples/reviews/resources/columns.yaml deleted file mode 100644 index 09fa2a033b..0000000000 --- a/examples/reviews/resources/columns.yaml +++ /dev/null @@ -1,43 +0,0 @@ -- kind: environment - name: dev - log_level: - spark: INFO - data: - type: csv - path: s3a://cortex-examples/reviews.csv - csv_config: - header: true - escape: "\"" - schema: [@review, @label] - -- kind: aggregate - name: max_review_length - aggregator_path: implementations/aggregators/max_length.py - input: @review - -- kind: aggregate - name: reviews_vocab - aggregator_path: implementations/aggregators/vocab.py - input: - col: @review - vocab_size: 10000 - -- kind: aggregate - name: label_index - aggregator: cortex.index_string - input: @label - -- kind: transformed_column - name: embedding_input - transformer_path: implementations/transformers/tokenize_string_to_int.py - input: - col: @review - max_len: @max_review_length - vocab: @reviews_vocab - -- kind: transformed_column - name: label_indexed - transformer: cortex.index_string - input: - col: @label - indexes: @label_index diff --git a/examples/reviews/resources/models.yaml b/examples/reviews/resources/models.yaml index c4f7a9396c..fe46218e33 100644 --- a/examples/reviews/resources/models.yaml +++ b/examples/reviews/resources/models.yaml @@ -1,28 +1,26 @@ -# - kind: model -# name: sentiment_dnn -# estimator: -# target_column: label_indexed -# feature_columns: -# - embedding_input -# aggregates: -# - reviews_vocab -# hparams: -# learning_rate: 0.01 -# data_partition_ratio: -# training: 0.8 -# evaluation: 0.2 -# training: -# batch_size: 64 -# num_steps: 5000 +- kind: model + name: sentiment_dnn + estimator_path: implementations/models/sentiment_dnn.py + target_column: @label_indexed + input: + embedding_input: @embedding_input + vocab: @reviews_vocab + hparams: + learning_rate: 0.01 + data_partition_ratio: + training: 0.8 + evaluation: 0.2 + training: + batch_size: 64 + num_steps: 5000 - kind: model name: sentiment_linear - estimator: cortex.linear_classifier + estimator_path: implementations/models/sentiment_linear.py target_column: @label_indexed input: - categorical_columns_with_vocab: - - col: @embedding_input - vocab: @reviews_vocab + embedding_input: @embedding_input + vocab: @reviews_vocab data_partition_ratio: training: 0.8 evaluation: 0.2 @@ -30,23 +28,22 @@ batch_size: 64 num_steps: 5000 -# - kind: model -# name: transformer -# type: classification -# target_column: label_indexed -# feature_columns: -# - embedding_input -# aggregates: -# - max_review_length -# - reviews_vocab -# prediction_key: outputs -# data_partition_ratio: -# training: 0.8 -# evaluation: 0.2 -# training: -# batch_size: 16 -# num_steps: 250000 -# evaluation: -# start_delay_secs: 1 -# compute: -# gpu: 1 +- kind: model + name: transformer + estimator_path: implementations/models/transformer.py + target_column: @label_indexed + input: + embedding_input: @embedding_input + max_review_length: @max_review_length + vocab: @reviews_vocab + prediction_key: outputs + data_partition_ratio: + training: 0.8 + evaluation: 0.2 + training: + batch_size: 16 + num_steps: 250000 + evaluation: + start_delay_secs: 1 + compute: + gpu: 1 From 6071f4e01f48c6d104fc833dadd13bbd33254b14 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 13 Jun 2019 20:56:55 +0000 Subject: [PATCH 3/4] Add data yaml files --- examples/mnist/resources/data.yaml | 25 +++++++++++ examples/movie-ratings/resources/data.yaml | 44 +++++++++++++++++++ .../transformers/tokenize_string_to_int.py | 9 ++-- examples/reviews/resources/data.yaml | 41 +++++++++++++++++ 4 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 examples/mnist/resources/data.yaml create mode 100644 examples/movie-ratings/resources/data.yaml create mode 100644 examples/reviews/resources/data.yaml diff --git a/examples/mnist/resources/data.yaml b/examples/mnist/resources/data.yaml new file mode 100644 index 0000000000..3eee3607e8 --- /dev/null +++ b/examples/mnist/resources/data.yaml @@ -0,0 +1,25 @@ +- kind: environment + name: dev + data: + type: csv + path: s3a://cortex-examples/mnist.csv + csv_config: + header: true + schema: [@image, @label] + +- kind: raw_column + name: image + type: STRING_COLUMN + required: true + +- kind: raw_column + name: label + type: INT_COLUMN + required: true + min: 0 + max: 9 + +- kind: transformed_column + name: image_pixels + transformer_path: implementations/transformers/decode_and_normalize.py + input: @image diff --git a/examples/movie-ratings/resources/data.yaml b/examples/movie-ratings/resources/data.yaml new file mode 100644 index 0000000000..40182c1727 --- /dev/null +++ b/examples/movie-ratings/resources/data.yaml @@ -0,0 +1,44 @@ +- kind: environment + name: dev + data: + type: csv + path: s3a://cortex-examples/movie-ratings.csv + csv_config: + header: true + schema: [@user_id, @movie_id, @rating, @timestamp] + +- kind: raw_column + name: user_id + type: STRING_COLUMN + +- kind: raw_column + name: movie_id + type: STRING_COLUMN + +- kind: raw_column + name: rating + type: FLOAT_COLUMN + +- kind: aggregate + name: user_id_index + aggregator: cortex.index_string + input: @user_id + +- kind: transformed_column + name: user_id_indexed + transformer: cortex.index_string + input: + col: @user_id + indexes: @user_id_index + +- kind: aggregate + name: movie_id_index + aggregator: cortex.index_string + input: @movie_id + +- kind: transformed_column + name: movie_id_indexed + transformer: cortex.index_string + input: + col: @movie_id + indexes: @movie_id_index diff --git a/examples/reviews/implementations/transformers/tokenize_string_to_int.py b/examples/reviews/implementations/transformers/tokenize_string_to_int.py index accd245f4f..151c18f754 100644 --- a/examples/reviews/implementations/transformers/tokenize_string_to_int.py +++ b/examples/reviews/implementations/transformers/tokenize_string_to_int.py @@ -4,19 +4,18 @@ def transform_python(input): - lol = input - text = lol["col"].lower() + text = input["col"].lower() token_index_list = [] - vocab = lol["vocab"] + vocab = input["vocab"] for token in non_word.split(text): if len(token) == 0: continue token_index_list.append(vocab.get(token, vocab[""])) - if len(token_index_list) == lol["max_len"]: + if len(token_index_list) == input["max_len"]: break - for i in range(lol["max_len"] - len(token_index_list)): + for i in range(input["max_len"] - len(token_index_list)): token_index_list.append(vocab[""]) return token_index_list diff --git a/examples/reviews/resources/data.yaml b/examples/reviews/resources/data.yaml new file mode 100644 index 0000000000..9da7e15d9c --- /dev/null +++ b/examples/reviews/resources/data.yaml @@ -0,0 +1,41 @@ +- kind: environment + name: dev + data: + type: csv + path: s3a://cortex-examples/reviews.csv + csv_config: + header: true + escape: "\"" + schema: [@review, @label] + +- kind: aggregate + name: max_review_length + aggregator_path: implementations/aggregators/max_length.py + input: @review + +- kind: aggregate + name: reviews_vocab + aggregator_path: implementations/aggregators/vocab.py + input: + col: @review + vocab_size: 10000 + +- kind: aggregate + name: label_index + aggregator: cortex.index_string + input: @label + +- kind: transformed_column + name: embedding_input + transformer_path: implementations/transformers/tokenize_string_to_int.py + input: + col: @review + max_len: @max_review_length + vocab: @reviews_vocab + +- kind: transformed_column + name: label_indexed + transformer: cortex.index_string + input: + col: @label + indexes: @label_index From d1d6db0b2b1dad091de782e0366030a20a4cd414 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Thu, 13 Jun 2019 14:15:12 -0700 Subject: [PATCH 4/4] Trim trailing whitespace --- examples/movie-ratings/resources/models.yaml | 2 +- examples/reviews/resources/models.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/movie-ratings/resources/models.yaml b/examples/movie-ratings/resources/models.yaml index 639e2df7bc..6cf6db550a 100644 --- a/examples/movie-ratings/resources/models.yaml +++ b/examples/movie-ratings/resources/models.yaml @@ -2,7 +2,7 @@ name: basic_embedding estimator_path: implementations/models/basic_embedding.py target_column: @rating - input: + input: embedding_columns: - col: @user_id_indexed vocab: @user_id_index diff --git a/examples/reviews/resources/models.yaml b/examples/reviews/resources/models.yaml index fe46218e33..0c353d4508 100644 --- a/examples/reviews/resources/models.yaml +++ b/examples/reviews/resources/models.yaml @@ -1,6 +1,6 @@ - kind: model name: sentiment_dnn - estimator_path: implementations/models/sentiment_dnn.py + estimator_path: implementations/models/sentiment_dnn.py target_column: @label_indexed input: embedding_input: @embedding_input