From 53be7bec599482594ceca930d56b87fbb1c36914 Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Thu, 25 Apr 2019 11:32:50 -0400
Subject: [PATCH 1/4] Upgrade Spark from 2.4.1 to 2.4.2

---
 docs/applications/implementations/aggregators.md  | 2 +-
 docs/applications/implementations/transformers.md | 2 +-
 docs/applications/resources/environments.md       | 2 +-
 images/spark-base/Dockerfile                      | 2 +-
 pkg/workloads/lib/package.py                      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/applications/implementations/aggregators.md b/docs/applications/implementations/aggregators.md
index c329992227..611d264690 100644
--- a/docs/applications/implementations/aggregators.md
+++ b/docs/applications/implementations/aggregators.md
@@ -42,7 +42,7 @@ def aggregate_spark(data, columns, args):
 The following packages have been pre-installed and can be used in your implementations:
 
 ```text
-pyspark==2.4.1
+pyspark==2.4.2
 boto3==1.9.78
 msgpack==0.6.1
 numpy>=1.13.3,<2
diff --git a/docs/applications/implementations/transformers.md b/docs/applications/implementations/transformers.md
index 9c238b426f..64a4ca91af 100644
--- a/docs/applications/implementations/transformers.md
+++ b/docs/applications/implementations/transformers.md
@@ -86,7 +86,7 @@ def reverse_transform_python(transformed_value, args):
 The following packages have been pre-installed and can be used in your implementations:
 
 ```text
-pyspark==2.4.1
+pyspark==2.4.2
 boto3==1.9.78
 msgpack==0.6.1
 numpy>=1.13.3,<2
diff --git a/docs/applications/resources/environments.md b/docs/applications/resources/environments.md
index 8e9f656431..45acd8525a 100644
--- a/docs/applications/resources/environments.md
+++ b/docs/applications/resources/environments.md
@@ -35,7 +35,7 @@ data:
 
 #### CSV Config
 
-To help ingest different styles of CSV files, Cortex supports the parameters listed below. All of these parameters are optional. A description and default values for each parameter can be found in the [PySpark CSV Documentation](https://spark.apache.org/docs/2.4.1/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv).
+To help ingest different styles of CSV files, Cortex supports the parameters listed below. All of these parameters are optional. A description and default values for each parameter can be found in the [PySpark CSV Documentation](https://spark.apache.org/docs/2.4.2/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv).
 
 ```yaml
 csv_config:
diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile
index dca5784f8f..5d454034b5 100644
--- a/images/spark-base/Dockerfile
+++ b/images/spark-base/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get update -qq && apt-get install -y -q \
 RUN mkdir -p /opt
 
 ARG HADOOP_VERSION="2.9.2"
-ARG SPARK_VERSION="2.4.1"
+ARG SPARK_VERSION="2.4.2"
 ARG TF_VERSION="1.12.0"
 # Check aws-java-sdk-bundle dependency version: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/$HADOOP_VERSION
 ARG AWS_JAVA_SDK_VERSION="1.11.199"
diff --git a/pkg/workloads/lib/package.py b/pkg/workloads/lib/package.py
index b7e228706f..c5eec7fca0 100644
--- a/pkg/workloads/lib/package.py
+++ b/pkg/workloads/lib/package.py
@@ -39,7 +39,7 @@ def get_build_order(python_packages):
 
 
 def get_restricted_packages():
-    req_list = ["pyspark==2.4.1", "tensorflow==1.12.0"]
+    req_list = ["pyspark==2.4.2", "tensorflow==1.12.0"]
     req_files = glob.glob("/src/**/requirements.txt", recursive=True)
 
     for req_file in req_files:

From aafd3873b9435388e740fe9f6e743ff63958fa93 Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Thu, 25 Apr 2019 19:55:12 +0000
Subject: [PATCH 2/4] Update TensorFlow connector to be compatible with Spark
 2.4.2 built with Scala 2.12

---
 images/spark-base/Dockerfile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile
index 5d454034b5..90291630c6 100644
--- a/images/spark-base/Dockerfile
+++ b/images/spark-base/Dockerfile
@@ -29,12 +29,17 @@ RUN curl http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/h
 RUN curl http://www.us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz | tar -zx && \
     mv spark-${SPARK_VERSION}-bin-without-hadoop $SPARK_HOME
 
+# Required for building tensorflow spark connector
+ARG SCALA_VERSION="2.12"
+# Scalatest version from https://github.com/apache/spark/blob/v2.4.2/pom.xml
+ARG SCALATEST_VERSION="3.0.3"
+
 # Tensorflow Spark connector
 RUN rm -rf ~/tf-ecosystem && git clone https://github.com/tensorflow/ecosystem.git ~/tf-ecosystem && \
     mvn -f ~/tf-ecosystem/hadoop/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \
     mvn -f ~/tf-ecosystem/hadoop/pom.xml -Dmaven.test.skip=true clean install -q && \
     mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \
-    mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install -Dspark.version=${SPARK_VERSION} -q && \
+    mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install -Dspark.version=${SPARK_VERSION} -Dscala.binary.version=${SCALA_VERSION} -Dscala.test.version=${SCALATEST_VERSION} -q && \
     mv ~/tf-ecosystem/spark/spark-tensorflow-connector/target/spark-tensorflow-connector_2.11-${TF_VERSION}.jar $SPARK_HOME/jars/
 
 # Hadoop AWS

From a2ab7a5e4778da8e9f27ef67a95c2e2a0fee48ac Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Thu, 25 Apr 2019 20:29:11 +0000
Subject: [PATCH 3/4] Address line length in Dockerfile

---
 images/spark-base/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile
index 90291630c6..1308053213 100644
--- a/images/spark-base/Dockerfile
+++ b/images/spark-base/Dockerfile
@@ -39,7 +39,8 @@ RUN rm -rf ~/tf-ecosystem && git clone https://github.com/tensorflow/ecosystem.g
     mvn -f ~/tf-ecosystem/hadoop/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \
     mvn -f ~/tf-ecosystem/hadoop/pom.xml -Dmaven.test.skip=true clean install -q && \
     mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \
-    mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install -Dspark.version=${SPARK_VERSION} -Dscala.binary.version=${SCALA_VERSION} -Dscala.test.version=${SCALATEST_VERSION} -q && \
+    mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install \
+        -Dspark.version=${SPARK_VERSION} -Dscala.binary.version=${SCALA_VERSION} -Dscala.test.version=${SCALATEST_VERSION} -q && \
     mv ~/tf-ecosystem/spark/spark-tensorflow-connector/target/spark-tensorflow-connector_2.11-${TF_VERSION}.jar $SPARK_HOME/jars/
 
 # Hadoop AWS

From e99f4425399c5203ed9d0758b342564b561ba321 Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Thu, 25 Apr 2019 20:35:48 +0000
Subject: [PATCH 4/4] Group ARG declarations together in spark-base Dockerfile

---
 images/spark-base/Dockerfile | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile
index 1308053213..9ff1cd3bf3 100644
--- a/images/spark-base/Dockerfile
+++ b/images/spark-base/Dockerfile
@@ -13,6 +13,10 @@ RUN mkdir -p /opt
 ARG HADOOP_VERSION="2.9.2"
 ARG SPARK_VERSION="2.4.2"
 ARG TF_VERSION="1.12.0"
+# Required for building tensorflow spark connector
+ARG SCALA_VERSION="2.12"
+# Scalatest version from https://github.com/apache/spark/blob/v2.4.2/pom.xml
+ARG SCALATEST_VERSION="3.0.3"
 # Check aws-java-sdk-bundle dependency version: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/$HADOOP_VERSION
 ARG AWS_JAVA_SDK_VERSION="1.11.199"
 
@@ -29,11 +33,6 @@ RUN curl http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/h
 RUN curl http://www.us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz | tar -zx && \
     mv spark-${SPARK_VERSION}-bin-without-hadoop $SPARK_HOME
 
-# Required for building tensorflow spark connector
-ARG SCALA_VERSION="2.12"
-# Scalatest version from https://github.com/apache/spark/blob/v2.4.2/pom.xml
-ARG SCALATEST_VERSION="3.0.3"
-
 # Tensorflow Spark connector
 RUN rm -rf ~/tf-ecosystem && git clone https://github.com/tensorflow/ecosystem.git ~/tf-ecosystem && \
     mvn -f ~/tf-ecosystem/hadoop/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \