From 53be7bec599482594ceca930d56b87fbb1c36914 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 25 Apr 2019 11:32:50 -0400 Subject: [PATCH 1/4] Upgrade spark from 2.4.1 to 2.4.2 --- docs/applications/implementations/aggregators.md | 2 +- docs/applications/implementations/transformers.md | 2 +- docs/applications/resources/environments.md | 2 +- images/spark-base/Dockerfile | 2 +- pkg/workloads/lib/package.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/applications/implementations/aggregators.md b/docs/applications/implementations/aggregators.md index c329992227..611d264690 100644 --- a/docs/applications/implementations/aggregators.md +++ b/docs/applications/implementations/aggregators.md @@ -42,7 +42,7 @@ def aggregate_spark(data, columns, args): The following packages have been pre-installed and can be used in your implementations: ```text -pyspark==2.4.1 +pyspark==2.4.2 boto3==1.9.78 msgpack==0.6.1 numpy>=1.13.3,<2 diff --git a/docs/applications/implementations/transformers.md b/docs/applications/implementations/transformers.md index 9c238b426f..64a4ca91af 100644 --- a/docs/applications/implementations/transformers.md +++ b/docs/applications/implementations/transformers.md @@ -86,7 +86,7 @@ def reverse_transform_python(transformed_value, args): The following packages have been pre-installed and can be used in your implementations: ```text -pyspark==2.4.1 +pyspark==2.4.2 boto3==1.9.78 msgpack==0.6.1 numpy>=1.13.3,<2 diff --git a/docs/applications/resources/environments.md b/docs/applications/resources/environments.md index 8e9f656431..45acd8525a 100644 --- a/docs/applications/resources/environments.md +++ b/docs/applications/resources/environments.md @@ -35,7 +35,7 @@ data: #### CSV Config -To help ingest different styles of CSV files, Cortex supports the parameters listed below. All of these parameters are optional. A description and default values for each parameter can be found in the [PySpark CSV Documentation](https://spark.apache.org/docs/2.4.1/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv). +To help ingest different styles of CSV files, Cortex supports the parameters listed below. All of these parameters are optional. A description and default values for each parameter can be found in the [PySpark CSV Documentation](https://spark.apache.org/docs/2.4.2/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv). ```yaml csv_config: diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile index dca5784f8f..5d454034b5 100644 --- a/images/spark-base/Dockerfile +++ b/images/spark-base/Dockerfile @@ -11,7 +11,7 @@ RUN apt-get update -qq && apt-get install -y -q \ RUN mkdir -p /opt ARG HADOOP_VERSION="2.9.2" -ARG SPARK_VERSION="2.4.1" +ARG SPARK_VERSION="2.4.2" ARG TF_VERSION="1.12.0" # Check aws-java-sdk-bundle dependency version: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/$HADOOP_VERSION ARG AWS_JAVA_SDK_VERSION="1.11.199" diff --git a/pkg/workloads/lib/package.py b/pkg/workloads/lib/package.py index b7e228706f..c5eec7fca0 100644 --- a/pkg/workloads/lib/package.py +++ b/pkg/workloads/lib/package.py @@ -39,7 +39,7 @@ def get_build_order(python_packages): def get_restricted_packages(): - req_list = ["pyspark==2.4.1", "tensorflow==1.12.0"] + req_list = ["pyspark==2.4.2", "tensorflow==1.12.0"] req_files = glob.glob("/src/**/requirements.txt", recursive=True) for req_file in req_files: From aafd3873b9435388e740fe9f6e743ff63958fa93 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 25 Apr 2019 19:55:12 +0000 Subject: [PATCH 2/4] Update tensorflow connector to be compatible with spark 2.4.2 built with scala 2.12 --- images/spark-base/Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile index 5d454034b5..90291630c6 100644 --- a/images/spark-base/Dockerfile +++ b/images/spark-base/Dockerfile @@ -29,12 +29,17 @@ RUN curl http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/h RUN curl http://www.us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz | tar -zx && \ mv spark-${SPARK_VERSION}-bin-without-hadoop $SPARK_HOME +# Required for building tensorflow spark connector +ARG SCALA_VERSION="2.12" +# Scalatest version from https://github.com/apache/spark/blob/v2.4.2/pom.xml +ARG SCALATEST_VERSION="3.0.3" + # Tensorflow Spark connector RUN rm -rf ~/tf-ecosystem && git clone https://github.com/tensorflow/ecosystem.git ~/tf-ecosystem && \ mvn -f ~/tf-ecosystem/hadoop/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \ mvn -f ~/tf-ecosystem/hadoop/pom.xml -Dmaven.test.skip=true clean install -q && \ mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \ - mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install -Dspark.version=${SPARK_VERSION} -q && \ + mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install -Dspark.version=${SPARK_VERSION} -Dscala.binary.version=${SCALA_VERSION} -Dscala.test.version=${SCALATEST_VERSION} -q && \ mv ~/tf-ecosystem/spark/spark-tensorflow-connector/target/spark-tensorflow-connector_2.11-${TF_VERSION}.jar $SPARK_HOME/jars/ # Hadoop AWS From a2ab7a5e4778da8e9f27ef67a95c2e2a0fee48ac Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 25 Apr 2019 20:29:11 +0000 Subject: [PATCH 3/4] Address line length in Dockerfile --- images/spark-base/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile index 90291630c6..1308053213 100644 --- a/images/spark-base/Dockerfile +++ b/images/spark-base/Dockerfile @@ -39,7 +39,8 @@ RUN rm -rf ~/tf-ecosystem && git clone https://github.com/tensorflow/ecosystem.g mvn -f ~/tf-ecosystem/hadoop/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \ mvn -f ~/tf-ecosystem/hadoop/pom.xml -Dmaven.test.skip=true clean install -q && \ mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \ - mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install -Dspark.version=${SPARK_VERSION} -Dscala.binary.version=${SCALA_VERSION} -Dscala.test.version=${SCALATEST_VERSION} -q && \ + mvn -f ~/tf-ecosystem/spark/spark-tensorflow-connector/pom.xml -Dmaven.test.skip=true clean install \ + -Dspark.version=${SPARK_VERSION} -Dscala.binary.version=${SCALA_VERSION} -Dscala.test.version=${SCALATEST_VERSION} -q && \ mv ~/tf-ecosystem/spark/spark-tensorflow-connector/target/spark-tensorflow-connector_2.11-${TF_VERSION}.jar $SPARK_HOME/jars/ # Hadoop AWS From e99f4425399c5203ed9d0758b342564b561ba321 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 25 Apr 2019 20:35:48 +0000 Subject: [PATCH 4/4] Group args together --- images/spark-base/Dockerfile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile index 1308053213..9ff1cd3bf3 100644 --- a/images/spark-base/Dockerfile +++ b/images/spark-base/Dockerfile @@ -13,6 +13,10 @@ RUN mkdir -p /opt ARG HADOOP_VERSION="2.9.2" ARG SPARK_VERSION="2.4.2" ARG TF_VERSION="1.12.0" +# Required for building tensorflow spark connector +ARG SCALA_VERSION="2.12" +# Scalatest version from https://github.com/apache/spark/blob/v2.4.2/pom.xml +ARG SCALATEST_VERSION="3.0.3" # Check aws-java-sdk-bundle dependency version: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/$HADOOP_VERSION ARG AWS_JAVA_SDK_VERSION="1.11.199" @@ -29,11 +33,6 @@ RUN curl http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/h RUN curl http://www.us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz | tar -zx && \ mv spark-${SPARK_VERSION}-bin-without-hadoop $SPARK_HOME -# Required for building tensorflow spark connector -ARG SCALA_VERSION="2.12" -# Scalatest version from https://github.com/apache/spark/blob/v2.4.2/pom.xml -ARG SCALATEST_VERSION="3.0.3" - # Tensorflow Spark connector RUN rm -rf ~/tf-ecosystem && git clone https://github.com/tensorflow/ecosystem.git ~/tf-ecosystem && \ mvn -f ~/tf-ecosystem/hadoop/pom.xml versions:set -DnewVersion=${TF_VERSION} -q && \