Skip to content

Commit 52795bc

Browse files
Nick-Harvey authored and k8s-ci-robot committed
Adding Pachyderm Example (squashed) (kubeflow#522)
* Adding Pachyderm Example (squashed)
* Add Dan Sanche to OWNERS (kubeflow#520)
* Fixed tf_operator import for github_issue_summarization example (kubeflow#527)
  * fixed tf_operator import
  * updated tf-operator import path
  * small change
  * updated PYTHONPATH
  * fixed syntax error
  * formatting issue
* Mnist pipelines (kubeflow#524)
  * added mnist pipelines sample
  * fixed lint issues
1 parent 895e88b commit 52795bc

18 files changed

+7156
-0
lines changed

README.md

+10
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@ This example covers the following concepts:
2424
1. Serving with Seldon Core
2525
1. Flask front-end
2626

27+
### [Pachyderm Example - GitHub issue summarization](./github_issue_summarization/Pachyderm_Example)
28+
Author: [Nick Harvey](https://github.com/Nick-Harvey) & [Daniel Whitenack](https://github.com/dwhitena)
29+
30+
This example covers the following concepts:
31+
1. A production pipeline for pre-processing, training, and model export
32+
1. CI/CD for model binaries, building and deploying a docker image for serving in Seldon
33+
1. Full tracking of what data produced which model, and what model is being used for inference
34+
1. Automatic updates of models based on changes to training data or code
35+
1. Training with single-node TensorFlow and distributed TF-jobs
36+
2737
### [Pytorch MNIST](./pytorch_mnist)
2838
Author: [David Sabater](https://github.com/dsdinter)
2939

github_issue_summarization/Pachyderm_Example/README.md

+420
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"pipeline": {
3+
"name": "build"
4+
},
5+
"transform": {
6+
"image": "seldonio/core-python-wrapper:0.7",
7+
"cmd": [ "/bin/bash" ],
8+
"stdin": [
9+
"mkdir /my_model",
10+
"cp /pfs/pre_process/*.dpkl /my_model",
11+
"cp /pfs/train/* /my_model",
12+
"python wrap_model.py /my_model IssueSummarization $PACH_JOB_ID pachyderm --out-folder=/pfs/out --base-image=python:3.6"
13+
]
14+
},
15+
"input": {
16+
"cross": [
17+
{
18+
"atom": {
19+
"repo": "train",
20+
"glob": "/"
21+
}
22+
},
23+
{
24+
"atom": {
25+
"repo": "pre_process",
26+
"glob": "/"
27+
}
28+
}
29+
]
30+
}
31+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Image used to run the pre-processing / training scripts of the GitHub issue
# summarization example. The scripts are copied to /workspace/src/.
FROM python:3.6

# Install every Python dependency with the image's own pip3 so that it lands in
# the interpreter that actually runs the scripts. pandas was previously pulled
# in through the Debian `python-pandas` apt package, which is built for the
# distribution's system Python and is not importable from the /usr/local
# interpreter shipped by this base image (NOTE(review): confirm nothing relied
# on the system-Python copy). `--no-cache-dir` keeps pip's download cache out of
# the layer.
RUN pip3 install --no-cache-dir -U \
        pandas \
        scikit-learn \
        ktext \
        IPython \
        annoy \
        tqdm \
        nltk \
        matplotlib \
        tensorflow \
        bernoulli \
        h5py \
    && git clone https://github.com/google/seq2seq.git \
    && pip3 install --no-cache-dir -e ./seq2seq/ \
    && rm -rf \
        /var/lib/apt/lists/* \
        /tmp/* \
        /var/tmp/* \
        /usr/share/man \
        /usr/share/doc \
        /usr/share/doc-base

# Ship the example sources inside the image.
COPY . /workspace/src/
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""Generates predictions using a stored model.
2+
3+
Uses trained model files to generate a prediction.
4+
"""
5+
6+
from __future__ import print_function
7+
8+
import os
9+
10+
import numpy as np
11+
import dill as dpickle
12+
from keras.models import load_model
13+
from seq2seq_utils import Seq2Seq_Inference
14+
15+
class IssueSummarization(object):
16+
17+
def __init__(self):
18+
body_pp_file = os.getenv('BODY_PP_FILE', 'body_preprocessor.dpkl')
19+
print('body_pp file {0}'.format(body_pp_file))
20+
with open(body_pp_file, 'rb') as body_file:
21+
body_pp = dpickle.load(body_file)
22+
23+
title_pp_file = os.getenv('TITLE_PP_FILE', 'title_preprocessor.dpkl')
24+
print('title_pp file {0}'.format(title_pp_file))
25+
with open(title_pp_file, 'rb') as title_file:
26+
title_pp = dpickle.load(title_file)
27+
28+
model_file = os.getenv('MODEL_FILE', 'output_model.h5')
29+
print('model file {0}'.format(model_file))
30+
self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
31+
decoder_preprocessor=title_pp,
32+
seq2seq_model=load_model(model_file))
33+
34+
def predict(self, input_text, feature_names): # pylint: disable=unused-argument
35+
return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import argparse
2+
import keras
3+
import pandas as pd
4+
from seq2seq_utils import load_text_processor
5+
from seq2seq_utils import Seq2Seq_Inference
6+
7+
# Command-line flags: paths to the trained model, both pre-processors and the
# test CSV, plus how many predictions to print.
parser = argparse.ArgumentParser()
for _flag in ("--input_model_h5",
              "--input_body_preprocessor_dpkl",
              "--input_title_preprocessor_dpkl",
              "--input_testdf_csv"):
    parser.add_argument(_flag)
parser.add_argument("--input_prediction_count", type=int, default=50)
args = parser.parse_args()
print(args)

# Test split to draw demo rows from.
testdf = pd.read_csv(args.input_testdf_csv)

# Restore the trained model first, then the body and title pre-processors.
seq2seq_Model = keras.models.load_model(args.input_model_h5)
num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)

# Wire the pieces into the inference helper.
_inference_parts = {
    'encoder_preprocessor': body_pp,
    'decoder_preprocessor': title_pp,
    'seq2seq_model': seq2seq_Model,
}
seq2seq_inf = Seq2Seq_Inference(**_inference_parts)

# Print predictions for n random rows of the test set.
seq2seq_inf.demo_model_predictions(n=args.input_prediction_count,
                                   issue_df=testdf)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import argparse
2+
import dill as dpickle
3+
import numpy as np
4+
from ktext.preprocess import processor
5+
import pandas as pd
6+
7+
# Command-line flags: the training CSV to read and the four artifacts to write.
parser = argparse.ArgumentParser()
for _flag in ("--input_traindf_csv",
              "--output_body_preprocessor_dpkl",
              "--output_title_preprocessor_dpkl",
              "--output_train_title_vecs_npy",
              "--output_train_body_vecs_npy"):
    parser.add_argument(_flag)
args = parser.parse_args()
print(args)

# Pull the raw issue bodies and titles out of the training split.
traindf = pd.read_csv(args.input_traindf_csv)
train_body_raw = traindf['body'].tolist()
train_title_raw = traindf['issue_title'].tolist()

# Bodies: clean, tokenize and pad/truncate every document to length 70,
# keeping only the 8,000 most frequent words; the remaining words are set to 1,
# the shared index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)

print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])

# Titles get their own processor with different parameters.
title_pp = processor(append_indicators=True,
                     keep_n=4500,
                     padding_maxlen=12,
                     padding='post')
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

# Persist the fitted pre-processors (body first, then title).
for _path, _fitted in ((args.output_body_preprocessor_dpkl, body_pp),
                       (args.output_title_preprocessor_dpkl, title_pp)):
    with open(_path, 'wb') as f:
        dpickle.dump(_fitted, f, protocol=2)

# Persist the vectorized training data (titles first, then bodies).
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import argparse
2+
import pandas as pd
3+
from sklearn.model_selection import train_test_split
4+
5+
# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_csv")
parser.add_argument("--sample_size", type=int, default=2000000)
parser.add_argument("--output_traindf_csv")
parser.add_argument("--output_testdf_csv")
args = parser.parse_args()
print(args)

pd.set_option('display.max_colwidth', 500)

# Read the issues and keep a random sample of at most --sample_size rows
# (2M by default, for speed of the tutorial).
issues_df = pd.read_csv(args.input_csv)

# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling is without replacement), so cap the request at the row count.
# This lets the default flag value work on datasets smaller than 2M rows.
sample_size = min(args.sample_size, len(issues_df))

# Hold out 10% of the sampled rows as the test split.
traindf, testdf = train_test_split(issues_df.sample(n=sample_size),
                                   test_size=.10)

# Print stats about the shape of the data.
print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

# Store output as CSV.
traindf.to_csv(args.output_traindf_csv)
testdf.to_csv(args.output_testdf_csv)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import argparse
2+
import keras
3+
import pandas as pd
4+
from seq2seq_utils import load_text_processor
5+
from seq2seq_utils import Seq2Seq_Inference
6+
7+
# Command-line flags: the full issue CSV, the trained model, both
# pre-processors, the test CSV and how many topics to demo.
parser = argparse.ArgumentParser()
for _flag in ("--input_csv",
              "--input_model_h5",
              "--input_body_preprocessor_dpkl",
              "--input_title_preprocessor_dpkl",
              "--input_testdf_csv"):
    parser.add_argument(_flag)
parser.add_argument("--input_topic_number", type=int, default=1)
args = parser.parse_args()
print(args)

# Full issue corpus plus the held-out test split.
all_data_df = pd.read_csv(args.input_csv)
testdf = pd.read_csv(args.input_testdf_csv)

# Restore the pre-processors first, then the trained model.
num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)
seq2seq_Model = keras.models.load_model(args.input_model_h5)

# Vectorize every issue body and hand the vectors to the recommender.
all_data_bodies = all_data_df.body.tolist()
all_data_vectorized = body_pp.transform_parallel(all_data_bodies)
_inference_parts = {
    'encoder_preprocessor': body_pp,
    'decoder_preprocessor': title_pp,
    'seq2seq_model': seq2seq_Model,
}
seq2seq_inf_rec = Seq2Seq_Inference(**_inference_parts)
# The return value is not used below; the call is kept for whatever state
# prepare_recommender sets up on seq2seq_inf_rec (confirm in seq2seq_utils).
recsys_annoyobj = seq2seq_inf_rec.prepare_recommender(all_data_vectorized,
                                                      all_data_df)

# Output recommendations for n topics.
seq2seq_inf_rec.demo_model_predictions(n=args.input_topic_number,
                                       issue_df=testdf,
                                       threshold=1)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
numpy
2+
keras
3+
dill
4+
matplotlib
5+
tensorflow
6+
annoy
7+
tqdm
8+
nltk
9+
IPython
10+
ktext
11+
h5py

0 commit comments

Comments
 (0)