|
"""Preprocess GitHub-issue bodies and titles for a seq2seq model.

Reads a training CSV, fits ktext preprocessors on the issue bodies and
issue titles, then serializes the fitted preprocessors (via dill) and the
vectorized training data (via numpy) to the output paths given on the
command line.
"""
import argparse

import dill as dpickle
import numpy as np
import pandas as pd
from ktext.preprocess import processor

# Parsing flags. Every path is mandatory: `required=True` makes argparse
# fail fast with a clear usage message instead of letting a missing flag
# surface later as an opaque crash on a None path.
parser = argparse.ArgumentParser()
parser.add_argument("--input_traindf_csv", required=True)
parser.add_argument("--output_body_preprocessor_dpkl", required=True)
parser.add_argument("--output_title_preprocessor_dpkl", required=True)
parser.add_argument("--output_train_title_vecs_npy", required=True)
parser.add_argument("--output_train_body_vecs_npy", required=True)
args = parser.parse_args()
print(args)

# Read data. Assumes the CSV has 'body' and 'issue_title' columns —
# TODO confirm against the upstream data-preparation step.
traindf = pd.read_csv(args.input_traindf_csv)
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()

# Clean, tokenize, and apply padding / truncating such that each document
# length = 70. Also, retain only the top 8,000 words in the vocabulary and set
# the remaining words to 1 which will become common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)

print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters:
# append_indicators=True adds start/end indicator tokens, and titles are short,
# so a smaller vocabulary (4,500) and max length (12) are used, padded at the
# end ('post') rather than the front.
title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')

# Process the title data.
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

# Save the preprocessors. dill is used (rather than plain pickle) so the
# fitted ktext processor objects round-trip; protocol=2 keeps the files
# readable by older Python consumers — TODO confirm that constraint still
# applies before bumping the protocol.
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f, protocol=2)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f, protocol=2)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
0 commit comments