Skip to content

Commit 834532c

Browse files
committed
Add support to run preprocess and tfdv on flink
1 parent 73760b2 commit 834532c

11 files changed

+808
-3
lines changed

.gitignore

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# NOTE: if you modify this file, you probably need to modify the file set that
2+
# is an input to 'maven-assembly-plugin' that generates source distribution.
3+
# This is typically in files named 'src.xml' throughout this repository.
4+
5+
# Ignore any offline repositories the user may have created.
6+
**/offline-repository/**/*
7+
8+
# Ignore files generated by the Gradle build process.
9+
**/.gradle/**/*
10+
**/.gogradle/**/*
11+
**/gogradle.lock
12+
**/build/**/*
13+
sdks/**/vendor/**/*
14+
runners/**/vendor/**/*
15+
**/.gradletasknamecache
16+
17+
# Ignore files generated by the Maven build process.
18+
**/bin/**/*
19+
**/dependency-reduced-pom.xml
20+
**/target/**/*
21+
22+
# Ignore generated archetypes
23+
sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/
24+
sdks/java/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/
25+
26+
# Ignore files generated by the Python build process.
27+
**/*.pyc
28+
**/*.pyo
29+
**/*.pyd
30+
**/*.egg-info/
31+
**/.eggs/
32+
**/nose-*.egg/
33+
**/.tox/**/*
34+
**/dist/**/*
35+
**/distribute-*/**/*
36+
**/env/**/*
37+
sdks/python/**/*.c
38+
sdks/python/**/*.so
39+
sdks/python/**/*.egg
40+
sdks/python/LICENSE
41+
sdks/python/NOTICE
42+
sdks/python/README.md
43+
sdks/python/apache_beam/portability/api/*pb2*.*
44+
sdks/python/nosetests.xml
45+
46+
# Ignore data files
47+
**/*data/
48+
**/data/
49+
50+
# Ignore IntelliJ files.
51+
**/.idea/**/*
52+
**/*.iml
53+
**/*.ipr
54+
**/*.iws
55+
**/out/**/*
56+
57+
# Ignore Eclipse files.
58+
**/.classpath
59+
**/.project
60+
**/.factorypath
61+
**/.checkstyle
62+
**/.fbExcludeFilterFile
63+
**/.apt_generated/**/*
64+
**/.settings/**/*
65+
66+
# Ignore Visual Studio Code files.
67+
**/.vscode/**/*
68+
69+
# Hotspot VM leaves this log in a non-target directory when java crashes
70+
**/hs_err_pid*.log
71+
72+
# Ignore files that end with '~', since they are most likely auto-save files
73+
# produced by a text editor.
74+
**/*~
75+
76+
# Ignore MacOSX files.
77+
**/.DS_Store/**/*
78+
**/.DS_Store
79+
80+
# Ignore Jupyter notebook checkpoints.
81+
**/.ipynb_checkpoints/**/*
82+
83+
# NOTE: if you modify this file, you probably need to modify the file set that
84+
# is an input to 'maven-assembly-plugin' that generates source distribution.
85+
# This is typically in files named 'src.xml' throughout this repository.
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
absl-py==0.6.1
2+
apache-beam==2.10.0.dev0
3+
astor==0.7.1
4+
avro==1.8.2
5+
backports-abc==0.5
6+
backports.shutil-get-terminal-size==1.0.0
7+
backports.weakref==1.0.post1
8+
bleach==3.0.2
9+
cachetools==3.0.0
10+
certifi==2018.10.15
11+
chardet==3.0.4
12+
configparser==3.5.0
13+
crcmod==1.7
14+
decorator==4.3.0
15+
defusedxml==0.5.0
16+
dill==0.2.8.2
17+
docopt==0.6.2
18+
entrypoints==0.2.3
19+
enum34==1.1.6
20+
fastavro==0.21.13
21+
fasteners==0.14.1
22+
funcsigs==1.0.2
23+
functools32==3.2.3.post2
24+
future==0.17.1
25+
futures==3.2.0
26+
gast==0.2.0
27+
google-api-core==1.5.2
28+
google-apitools==0.5.24
29+
google-auth==1.6.1
30+
google-auth-httplib2==0.0.3
31+
google-cloud-bigquery==1.6.0
32+
google-cloud-core==0.28.1
33+
google-cloud-dataproc==0.2.0
34+
google-cloud-pubsub==0.35.4
35+
google-resumable-media==0.3.1
36+
googleapis-common-protos==1.5.3
37+
googledatastore==7.0.1
38+
grpc==0.3.post19
39+
grpc-google-iam-v1==0.11.1
40+
grpcio==1.8.6
41+
hdfs==2.1.0
42+
httplib2==0.9.2
43+
idna==2.7
44+
ipaddress==1.0.22
45+
ipykernel==4.10.0
46+
ipython==5.8.0
47+
ipython-genutils==0.2.0
48+
ipywidgets==7.4.2
49+
Jinja2==2.10
50+
jsonschema==2.6.0
51+
jupyter==1.0.0
52+
jupyter-client==5.2.3
53+
jupyter-console==5.2.0
54+
jupyter-core==4.4.0
55+
Markdown==3.0.1
56+
MarkupSafe==1.1.0
57+
mistune==0.8.4
58+
mock==2.0.0
59+
monotonic==1.5
60+
msgpack-python==0.5.6
61+
nbconvert==5.4.0
62+
nbformat==4.4.0
63+
notebook==5.7.0
64+
numpy==1.13.3
65+
oauth2client==3.0.0
66+
pandas==0.23.4
67+
pandocfilters==1.4.2
68+
pathlib2==2.3.2
69+
pbr==5.1.1
70+
pexpect==4.6.0
71+
pickleshare==0.7.5
72+
prometheus-client==0.4.2
73+
prompt-toolkit==1.0.15
74+
proto-google-cloud-datastore-v1==0.90.4
75+
proto-google-cloud-pubsub-v1==0.15.4
76+
protobuf==3.6.0
77+
ptyprocess==0.6.0
78+
pyasn1==0.4.4
79+
pyasn1-modules==0.2.2
80+
pydot==1.2.4
81+
Pygments==2.2.0
82+
pyparsing==2.3.0
83+
python-dateutil==2.7.5
84+
pytz==2018.4
85+
PyVCF==0.6.8
86+
PyYAML==3.13
87+
pyzmq==17.1.2
88+
qtconsole==4.4.3
89+
requests==2.20.1
90+
rsa==4.0
91+
scandir==1.9.0
92+
Send2Trash==1.5.0
93+
simplegeneric==0.8.1
94+
singledispatch==3.4.0.3
95+
six==1.11.0
96+
tensorboard==1.9.0
97+
tensorflow==1.9.0
98+
tensorflow-data-validation==0.9.0
99+
tensorflow-metadata==0.9.0
100+
tensorflow-model-analysis==0.9.2
101+
tensorflow-serving-api==1.9.0
102+
tensorflow-transform==0.9.0
103+
termcolor==1.1.0
104+
terminado==0.8.1
105+
testpath==0.4.2
106+
tfx-chicago-taxi==0.9.2
107+
tornado==5.1.1
108+
traitlets==4.3.2
109+
typing==3.6.6
110+
urllib3==1.24.1
111+
wcwidth==0.1.7
112+
webencodings==0.5.1
113+
Werkzeug==0.14.1
114+
widgetsnbextension==3.4.2
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
#!/usr/bin/env bash

# Creates a Google Cloud Dataproc cluster running a Flink YARN session and
# opens an SSH tunnel to the Flink jobmanager so Beam pipelines (preprocess,
# TFDV) can be submitted to it from the local machine.
#
# Cluster and GCS paths are namespaced by user name so several people can
# share one project.

#set -x
set -e

# $(...) instead of legacy backticks; USER is deliberately reset to the
# invoking user so all derived names below are consistent.
USER="$(whoami)"
CLUSTER_NAME="$USER-flink-156"
NUM_WORKERS=2
FLINK_VERSION=1.5.6
WORK_DIR="gs://clouddfe-$USER/tmp"
CLOUD_WORKER_IMAGE="gcr.io/dataflow-build/$USER/beam_fnapi_python:latest"
TASK_MANAGER_MEM=10240
FLINK_LOCAL_PORT=8081
TASK_MANAGER_SLOTS=1
DATAPROC_VERSION=1.2

# Derived names: Dataproc master VM and staged init-action scripts.
MASTER_NAME="$CLUSTER_NAME-m"
FLINK_INIT="$WORK_DIR/flink/flink-init-dataproc.sh"
DOCKER_INIT="$WORK_DIR/flink/docker-init.sh"
LOCAL_WORKER_IMAGE="$USER-docker-apache.bintray.io/beam/python:latest"
FLINK_DOWNLOAD_URL="http://archive.apache.org/dist/flink/flink-$FLINK_VERSION/flink-$FLINK_VERSION-bin-hadoop28-scala_2.11.tgz"

# Populated by get_leader() once the YARN session is discovered.
YARN_APPLICATION=""
YARN_APPLICATION_MASTER=""
function is_master() {
  # Succeeds (status 0) only when this VM's Dataproc role metadata says
  # 'Master'. Assignment kept on the 'local' line so a failing metadata
  # lookup yields an empty role instead of aborting under 'set -e'.
  local node_role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
  [[ "$node_role" == 'Master' ]]
}
34+
35+
function get_leader() {
  # Discovers the single YARN application hosting the Flink session and
  # stores its application id and application-master address (host:port) in
  # the YARN_APPLICATION / YARN_APPLICATION_MASTER globals. Exits when more
  # than one application is running, since the tunnel can only target one
  # jobmanager.
  local i=0
  local -A application_ids
  local -A application_masters
  #gcloud compute ssh yarn@$MASTER_NAME --command="yarn application -list" | grep "$CLUSTER_NAME"
  echo "Yarn Applications"
  while read -r line; do
    # $line is echoed unquoted on purpose: word splitting normalizes tabs in
    # the yarn output to single spaces so the 's/ .*//' seds below work.
    echo $line
    # First column is the application id; the tracking-URL column starts
    # with the cluster name and yields host:port.
    application_ids[$i]=$(echo $line | sed "s/ .*//")
    application_masters[$i]=$(echo $line | sed "s/.*$CLUSTER_NAME/$CLUSTER_NAME/" | sed "s/ .*//")
    i=$((i+1))
  done <<< "$(gcloud compute ssh yarn@$MASTER_NAME --command="yarn application -list" | grep "$CLUSTER_NAME")"
  # ^ The here-string must be quoted: unquoted, the substitution's newlines
  # collapse to spaces, every application lands on one input line and the
  # multi-application guard below can never trigger.

  if [ $i != 1 ]; then
    echo "Multiple applications found. Make sure that only 1 application is running on the cluster."
    for app in ${application_ids[*]};
    do
      echo $app
    done

    echo "Execute 'gcloud compute ssh yarn@$MASTER_NAME --command=\"yarn application -kill <APP_NAME>\"' to kill the yarn application."
    exit 1
  fi

  YARN_APPLICATION=${application_ids[0]}
  YARN_APPLICATION_MASTER=${application_masters[0]}
  echo "Using Yarn Application $YARN_APPLICATION $YARN_APPLICATION_MASTER"
}
63+
64+
function upload_worker_image() {
  # Tags the locally built Beam Python SDK worker image for GCR and pushes
  # it so cluster nodes can pull it.
  echo "Tagging worker image $LOCAL_WORKER_IMAGE to $CLOUD_WORKER_IMAGE"
  docker tag "$LOCAL_WORKER_IMAGE" "$CLOUD_WORKER_IMAGE"
  echo "Pushing worker image $CLOUD_WORKER_IMAGE to GCR"
  docker push "$CLOUD_WORKER_IMAGE"
}
70+
71+
function pull_worker_image() {
  # Pre-pulls the worker docker image on the master and every worker VM so
  # pipeline startup does not pay the pull cost.
  # Capture the instance list once — the original ran this gcloud pipeline
  # twice (once for the log line, once for xargs). Assignment stays on one
  # line so a no-match grep (status 1) does not abort under 'set -e',
  # matching the original's behavior.
  local instances=$(gcloud compute instances list | sed "s/ .*//" | grep "^\($CLUSTER_NAME-m$\|$CLUSTER_NAME-w-[a-zA-Z0-9]*$\)")
  echo "Pulling worker image $CLOUD_WORKER_IMAGE on workers $instances"
  # -P 100: pull on all instances in parallel.
  echo "$instances" | xargs -I INSTANCE -P 100 gcloud compute ssh yarn@INSTANCE --command="docker pull $CLOUD_WORKER_IMAGE"
}
75+
76+
function start_yarn_application() {
  # Launches a detached (-d) Flink YARN session named flink_yarn on the
  # master, sized by the NUM_WORKERS / TASK_MANAGER_* settings.
  echo "Starting yarn application on $MASTER_NAME"
  local session_cmd="/usr/lib/flink/bin/yarn-session.sh -n $NUM_WORKERS -tm $TASK_MANAGER_MEM -s $TASK_MANAGER_SLOTS -d -nm flink_yarn"
  execute_on_master "$session_cmd"
}
80+
81+
function execute_on_master() {
  # Runs the given command string on the cluster master over SSH as 'yarn'.
  local remote_cmd="$1"
  gcloud compute ssh "yarn@${MASTER_NAME}" --command="$remote_cmd"
}
84+
85+
function upload_resources() {
  # Stages the Flink distribution plus this script and the init-action
  # scripts in the GCS work directory, where the Dataproc initialization
  # actions fetch them from.
  local TMP_FOLDER
  TMP_FOLDER=$(mktemp -d -t flink_tmp_XXXX)

  echo "Downloading flink at $TMP_FOLDER"
  wget -P "$TMP_FOLDER" "$FLINK_DOWNLOAD_URL"

  echo "Uploading resources to GCS $WORK_DIR"
  # Scripts are expected next to the current working directory.
  cp ./create_cluster.sh "$TMP_FOLDER"
  cp ./docker-init.sh "$TMP_FOLDER"
  cp ./flink-init-dataproc.sh "$TMP_FOLDER"

  gsutil cp -r "$TMP_FOLDER"/* "$WORK_DIR/flink"

  # -f and '--' make cleanup robust to odd names and repeat runs.
  rm -rf -- "$TMP_FOLDER"
}
100+
101+
function start_tunnel() {
  # Looks up the jobmanager RPC port from the Flink REST config and opens an
  # SSH tunnel forwarding both the web/REST endpoint (FLINK_LOCAL_PORT) and
  # the RPC port to localhost, plus a SOCKS proxy on 1080, so pipelines can
  # be submitted from this machine.
  local job_server_config
  job_server_config=$(execute_on_master "curl -s \"http://$YARN_APPLICATION_MASTER/jobmanager/config\"")
  local key="jobmanager.rpc.port"
  local yarn_application_master_host
  yarn_application_master_host=$(echo $YARN_APPLICATION_MASTER | cut -d ":" -f1)

  # NOTE(review): python 2 print statement — assumes 'python' on PATH is
  # python 2; confirm on the target workstation image.
  jobmanager_rpc_port=$(echo $job_server_config | python -c "import sys, json; print [ e['value'] for e in json.load(sys.stdin) if e['key'] == u'$key'][0]")
  # This string is shown to the user for re-establishing the tunnel; it must
  # match the command executed below. (It previously omitted 'yarn@' and so
  # advertised a different command than the one actually run.)
  local tunnel_command="gcloud compute ssh yarn@$MASTER_NAME -- -L $FLINK_LOCAL_PORT:$YARN_APPLICATION_MASTER -L $jobmanager_rpc_port:$yarn_application_master_host:$jobmanager_rpc_port -D 1080"
  local kill_command="gcloud compute ssh yarn@$MASTER_NAME --command=\"yarn application -kill $YARN_APPLICATION\""
  echo "===================Closing the shell does not stop the yarn application==================="
  echo "Execute \"$kill_command\" to kill the yarn application."
  echo "Starting tunnel \"$tunnel_command\""
  echo "Exposing flink jobserver at localhost:$FLINK_LOCAL_PORT"
  # Blocks until the SSH session ends; the YARN session keeps running.
  gcloud compute ssh yarn@$MASTER_NAME -- -L $FLINK_LOCAL_PORT:$YARN_APPLICATION_MASTER -L $jobmanager_rpc_port:$yarn_application_master_host:$jobmanager_rpc_port -D 1080
  echo "===================Closing the shell does not stop the yarn application==================="
  echo "Execute \"$kill_command\" to kill the yarn application."
  echo "To re-establish tunnel use \"$tunnel_command\""
}
118+
119+
function create_cluster() {
  # Provisions the Dataproc cluster; the staged init actions install Flink
  # and docker on every node. The trailing sleep gives the YARN services a
  # moment to settle before a session is started.
  echo "Starting dataproc cluster."
  gcloud dataproc clusters create "$CLUSTER_NAME" \
    --num-workers="$NUM_WORKERS" \
    --initialization-actions "$FLINK_INIT,$DOCKER_INIT" \
    --metadata "flink_version=$FLINK_VERSION,work_dir=$WORK_DIR/flink" \
    --image-version="$DATAPROC_VERSION"
  echo "Sleeping for 30 sec"
  sleep 30s
}
125+
126+
function main() {
127+
upload_resources
128+
create_cluster # Comment this line to use existing cluster.
129+
start_yarn_application # Comment this line if yarn application is already running on the cluster.
130+
get_leader
131+
upload_worker_image
132+
pull_worker_image
133+
start_tunnel
134+
}
135+
136+
main "$@"

0 commit comments

Comments
 (0)