diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
index 1619b009e9364..10a39497c8ed9 100644
--- a/dev/infra/Dockerfile
+++ b/dev/infra/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image"
 # Overwrite this label to avoid exposing the underlying Ubuntu OS version label
 LABEL org.opencontainers.image.version=""

-ENV FULL_REFRESH_DATE 20241002
+ENV FULL_REFRESH_DATE 20241007

 ENV DEBIAN_FRONTEND noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN true
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.9 && \
     ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.9 && \
     ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
-RUN pypy3 -m pip install 'numpy==1.26.4' 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
+RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml

-ARG BASIC_PIP_PKGS="numpy==1.26.4 pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
 # Python deps for Spark Connect
 ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4 graphviz==0.20.3"
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 937753b50bb13..b89755d9c18a5 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -699,7 +699,7 @@ class LinearSVC(
     >>> model_path = temp_path + "/svm_model"
     >>> model.save(model_path)
     >>> model2 = LinearSVCModel.load(model_path)
-    >>> model.coefficients[0] == model2.coefficients[0]
+    >>> bool(model.coefficients[0] == model2.coefficients[0])
     True
     >>> model.intercept == model2.intercept
     True
@@ -1210,7 +1210,7 @@ class LogisticRegression(
     >>> model_path = temp_path + "/lr_model"
     >>> blorModel.save(model_path)
     >>> model2 = LogisticRegressionModel.load(model_path)
-    >>> blorModel.coefficients[0] == model2.coefficients[0]
+    >>> bool(blorModel.coefficients[0] == model2.coefficients[0])
     True
     >>> blorModel.intercept == model2.intercept
     True
@@ -2038,9 +2038,9 @@ class RandomForestClassifier(
     >>> result = model.transform(test0).head()
     >>> result.prediction
     0.0
-    >>> numpy.argmax(result.probability)
+    >>> int(numpy.argmax(result.probability))
     0
-    >>> numpy.argmax(result.newRawPrediction)
+    >>> int(numpy.argmax(result.newRawPrediction))
     0
     >>> result.leafId
     DenseVector([0.0, 0.0, 0.0])
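Note on the doctest changes in this and the following files: NumPy 2.0 implements NEP 51, which changed how numpy scalars print. Operations that return a numpy scalar (vector indexing, comparisons, argmax, and so on) now display as np.float64(1.0), np.int64(0), or np.True_ rather than 1.0, 0, or True, so the old expected outputs fail under NumPy 2, while wrapped expected outputs would fail under NumPy 1.x. Casting to the Python built-in keeps a single expected output valid on both majors. A minimal sketch of the failure mode, assuming NumPy >= 2.0 is installed:

    >>> import numpy as np
    >>> coeffs = np.array([1.0, 2.0])
    >>> coeffs[0]                                   # NumPy 1.x printed: 1.0
    np.float64(1.0)
    >>> coeffs[0] == 1.0                            # NumPy 1.x printed: True
    np.True_
    >>> float(coeffs[0]), bool(coeffs[0] == 1.0)    # stable on both majors
    (1.0, True)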
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index d08e241b41d23..d7cc27e274279 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -266,7 +266,7 @@ class LinearRegression(
     True
     >>> abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
     True
-    >>> abs(model.coefficients[0] - 1.0) < 0.001
+    >>> bool(abs(model.coefficients[0] - 1.0) < 0.001)
     True
     >>> abs(model.intercept - 0.0) < 0.001
     True
@@ -283,11 +283,11 @@ class LinearRegression(
     >>> model_path = temp_path + "/lr_model"
     >>> model.save(model_path)
     >>> model2 = LinearRegressionModel.load(model_path)
-    >>> model.coefficients[0] == model2.coefficients[0]
+    >>> bool(model.coefficients[0] == model2.coefficients[0])
     True
-    >>> model.intercept == model2.intercept
+    >>> bool(model.intercept == model2.intercept)
     True
-    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    >>> bool(model.transform(test0).take(1) == model2.transform(test0).take(1))
     True
     >>> model.numFeatures
     1
@@ -2542,7 +2542,7 @@ class GeneralizedLinearRegression(
     >>> model2 = GeneralizedLinearRegressionModel.load(model_path)
     >>> model.intercept == model2.intercept
     True
-    >>> model.coefficients[0] == model2.coefficients[0]
+    >>> bool(model.coefficients[0] == model2.coefficients[0])
     True
     >>> model.transform(df).take(1) == model2.transform(df).take(1)
     True
diff --git a/python/pyspark/ml/tests/test_functions.py b/python/pyspark/ml/tests/test_functions.py
index 7df0a26394140..e67e46ded67bd 100644
--- a/python/pyspark/ml/tests/test_functions.py
+++ b/python/pyspark/ml/tests/test_functions.py
@@ -18,6 +18,7 @@
 import numpy as np

+from pyspark.loose_version import LooseVersion
 from pyspark.ml.functions import predict_batch_udf
 from pyspark.sql.functions import array, struct, col
 from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StructType, StructField, FloatType
@@ -193,6 +194,10 @@ def predict(inputs):
         batch_sizes = preds["preds"].to_numpy()
         self.assertTrue(all(batch_sizes <= batch_size))

+    # TODO(SPARK-49793): enable the test below
+    @unittest.skipIf(
+        LooseVersion(np.__version__) >= LooseVersion("2"), "Caching does not work with numpy 2"
+    )
     def test_caching(self):
         def make_predict_fn():
             # emulate loading a model, this should only be invoked once (per worker process)
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index e8713d81c4d62..888beff663523 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -706,7 +706,7 @@ class CrossValidator(
     >>> cvModel = cv.fit(dataset)
     >>> cvModel.getNumFolds()
     3
-    >>> cvModel.avgMetrics[0]
+    >>> float(cvModel.avgMetrics[0])
     0.5
     >>> path = tempfile.mkdtemp()
     >>> model_path = path + "/model"
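The test_functions.py hunk is different in kind: it does not fix doctest output but skips test_caching entirely, since (per the TODO above) the predict_batch_udf model cache misbehaves under NumPy 2, tracked separately as SPARK-49793. The guard compares versions with PySpark's vendored LooseVersion; roughly, assuming NumPy >= 2.0 means the test is skipped:

    >>> from pyspark.loose_version import LooseVersion
    >>> LooseVersion("1.26.4") >= LooseVersion("2")   # numpy 1.x: test runs
    False
    >>> LooseVersion("2.1.0") >= LooseVersion("2")    # numpy 2.x: test skipped
    True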
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 1e1795d9fb3d4..bf8fd04dc2837 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -172,9 +172,9 @@ class LogisticRegressionModel(LinearClassificationModel):
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
     >>> sameModel = LogisticRegressionModel.load(sc, path)
-    >>> sameModel.predict(numpy.array([0.0, 1.0]))
+    >>> int(sameModel.predict(numpy.array([0.0, 1.0])))
     1
-    >>> sameModel.predict(SparseVector(2, {0: 1.0}))
+    >>> int(sameModel.predict(SparseVector(2, {0: 1.0})))
     0
     >>> from shutil import rmtree
     >>> try:
@@ -555,7 +555,7 @@ class SVMModel(LinearClassificationModel):
     >>> svm.predict(sc.parallelize([[1.0]])).collect()
     [1]
     >>> svm.clearThreshold()
-    >>> svm.predict(numpy.array([1.0]))
+    >>> float(svm.predict(numpy.array([1.0])))
     1.44...

     >>> sparse_data = [
@@ -573,9 +573,9 @@ class SVMModel(LinearClassificationModel):
     >>> path = tempfile.mkdtemp()
     >>> svm.save(sc, path)
     >>> sameModel = SVMModel.load(sc, path)
-    >>> sameModel.predict(SparseVector(2, {1: 1.0}))
+    >>> int(sameModel.predict(SparseVector(2, {1: 1.0})))
     1
-    >>> sameModel.predict(SparseVector(2, {0: -1.0}))
+    >>> int(sameModel.predict(SparseVector(2, {0: -1.0})))
     0
     >>> from shutil import rmtree
     >>> try:
@@ -756,11 +756,11 @@ class NaiveBayesModel(Saveable, Loader["NaiveBayesModel"]):
     ...     LabeledPoint(1.0, [1.0, 0.0]),
     ... ]
     >>> model = NaiveBayes.train(sc.parallelize(data))
-    >>> model.predict(numpy.array([0.0, 1.0]))
+    >>> float(model.predict(numpy.array([0.0, 1.0])))
     0.0
-    >>> model.predict(numpy.array([1.0, 0.0]))
+    >>> float(model.predict(numpy.array([1.0, 0.0])))
     1.0
-    >>> model.predict(sc.parallelize([[1.0, 0.0]])).collect()
+    >>> list(map(float, model.predict(sc.parallelize([[1.0, 0.0]])).collect()))
     [1.0]
     >>> sparse_data = [
     ...     LabeledPoint(0.0, SparseVector(2, {1: 0.0})),
@@ -768,15 +768,18 @@ class NaiveBayesModel(Saveable, Loader["NaiveBayesModel"]):
     ...     LabeledPoint(1.0, SparseVector(2, {0: 1.0}))
     ... ]
     >>> model = NaiveBayes.train(sc.parallelize(sparse_data))
-    >>> model.predict(SparseVector(2, {1: 1.0}))
+    >>> float(model.predict(SparseVector(2, {1: 1.0})))
     0.0
-    >>> model.predict(SparseVector(2, {0: 1.0}))
+    >>> float(model.predict(SparseVector(2, {0: 1.0})))
     1.0
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> model.save(sc, path)
     >>> sameModel = NaiveBayesModel.load(sc, path)
-    >>> sameModel.predict(SparseVector(2, {0: 1.0})) == model.predict(SparseVector(2, {0: 1.0}))
+    >>> bool((
+    ...     sameModel.predict(SparseVector(2, {0: 1.0})) ==
+    ...     model.predict(SparseVector(2, {0: 1.0}))
+    ... ))
     True
     >>> from shutil import rmtree
     >>> try:
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 24884f4853371..915a55595cb53 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -554,9 +554,9 @@ class PCA:
     ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
     >>> model = PCA(2).fit(sc.parallelize(data))
     >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
-    >>> pcArray[0]
+    >>> float(pcArray[0])
     1.648...
-    >>> pcArray[1]
+    >>> float(pcArray[1])
     -4.013...
     """
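Two casting patterns recur in the MLlib doctests above. A bare predict() returns a numpy scalar, handled with float()/int() as elsewhere; predict() over an RDD returns a numpy scalar per element, and a collected list prints its elements' wrapped reprs, so the cast must be applied element-wise. Illustrated with plain numpy (no Spark needed), assuming NumPy >= 2.0:

    >>> import numpy as np
    >>> collected = [np.float64(1.0)]   # stand-in for rdd.collect() output
    >>> collected                       # NumPy 1.x printed: [1.0]
    [np.float64(1.0)]
    >>> list(map(float, collected))
    [1.0]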
diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py
index 80bbd717071dc..dbe1048a64b36 100644
--- a/python/pyspark/mllib/random.py
+++ b/python/pyspark/mllib/random.py
@@ -134,9 +134,9 @@ def normalRDD(
         >>> stats = x.stats()
         >>> stats.count()
         1000
-        >>> abs(stats.mean() - 0.0) < 0.1
+        >>> bool(abs(stats.mean() - 0.0) < 0.1)
         True
-        >>> abs(stats.stdev() - 1.0) < 0.1
+        >>> bool(abs(stats.stdev() - 1.0) < 0.1)
         True
         """
         return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
@@ -186,10 +186,10 @@ def logNormalRDD(
         >>> stats = x.stats()
         >>> stats.count()
         1000
-        >>> abs(stats.mean() - expMean) < 0.5
+        >>> bool(abs(stats.mean() - expMean) < 0.5)
         True
         >>> from math import sqrt
-        >>> abs(stats.stdev() - expStd) < 0.5
+        >>> bool(abs(stats.stdev() - expStd) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -238,7 +238,7 @@ def poissonRDD(
         >>> abs(stats.mean() - mean) < 0.5
         True
         >>> from math import sqrt
-        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
+        >>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -285,7 +285,7 @@ def exponentialRDD(
         >>> abs(stats.mean() - mean) < 0.5
         True
         >>> from math import sqrt
-        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
+        >>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -336,9 +336,9 @@ def gammaRDD(
         >>> stats = x.stats()
         >>> stats.count()
         1000
-        >>> abs(stats.mean() - expMean) < 0.5
+        >>> bool(abs(stats.mean() - expMean) < 0.5)
         True
-        >>> abs(stats.stdev() - expStd) < 0.5
+        >>> bool(abs(stats.stdev() - expStd) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -384,7 +384,7 @@ def uniformVectorRDD(
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
-       >>> mat.max() <= 1.0 and mat.min() >= 0.0
+       >>> bool(mat.max() <= 1.0 and mat.min() >= 0.0)
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
@@ -430,9 +430,9 @@ def normalVectorRDD(
         >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - 0.0) < 0.1
+        >>> bool(abs(mat.mean() - 0.0) < 0.1)
         True
-        >>> abs(mat.std() - 1.0) < 0.1
+        >>> bool(abs(mat.std() - 1.0) < 0.1)
         True
         """
         return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed)
@@ -488,9 +488,9 @@ def logNormalVectorRDD(
         >>> mat = np.matrix(m)
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - expMean) < 0.1
+        >>> bool(abs(mat.mean() - expMean) < 0.1)
         True
-        >>> abs(mat.std() - expStd) < 0.1
+        >>> bool(abs(mat.std() - expStd) < 0.1)
         True
         """
         return callMLlibFunc(
@@ -545,13 +545,13 @@ def poissonVectorRDD(
         >>> import numpy as np
         >>> mean = 100.0
         >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
-        >>> mat = np.mat(rdd.collect())
+        >>> mat = np.asmatrix(rdd.collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - mean) < 0.5
+        >>> bool(abs(mat.mean() - mean) < 0.5)
         True
         >>> from math import sqrt
-        >>> abs(mat.std() - sqrt(mean)) < 0.5
+        >>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -599,13 +599,13 @@ def exponentialVectorRDD(
         >>> import numpy as np
         >>> mean = 0.5
         >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
-        >>> mat = np.mat(rdd.collect())
+        >>> mat = np.asmatrix(rdd.collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - mean) < 0.5
+        >>> bool(abs(mat.mean() - mean) < 0.5)
         True
         >>> from math import sqrt
-        >>> abs(mat.std() - sqrt(mean)) < 0.5
+        >>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
         True
         """
         return callMLlibFunc(
@@ -662,9 +662,9 @@ def gammaVectorRDD(
         >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
         >>> mat.shape
         (100, 100)
-        >>> abs(mat.mean() - expMean) < 0.1
+        >>> bool(abs(mat.mean() - expMean) < 0.1)
         True
-        >>> abs(mat.std() - expStd) < 0.1
+        >>> bool(abs(mat.std() - expStd) < 0.1)
         True
         """
         return callMLlibFunc(
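Besides the bool() wrappers, the poissonVectorRDD and exponentialVectorRDD doctests above switch from np.mat to np.asmatrix: np.mat was an alias of np.asmatrix and was removed from the main namespace in NumPy 2.0, while np.asmatrix exists on both major versions. A quick sketch:

    >>> import numpy as np
    >>> mat = np.asmatrix([[1.0, 2.0], [3.0, 4.0]])  # np.mat(...) raises AttributeError on numpy 2
    >>> mat.shape
    (2, 2)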
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index f1003327912d0..87f05bc0979b8 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -144,9 +144,9 @@ class LinearRegressionModelBase(LinearModel):
     --------
     >>> from pyspark.mllib.linalg import SparseVector
     >>> lrmb = LinearRegressionModelBase(np.array([1.0, 2.0]), 0.1)
-    >>> abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6
+    >>> bool(abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6)
     True
-    >>> abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6
+    >>> bool(abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6)
     True
     """

@@ -190,23 +190,23 @@ class LinearRegressionModel(LinearRegressionModelBase):
     ... ]
     >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
     ...     initialWeights=np.array([1.0]))
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
+    >>> bool(abs(lrm.predict(np.array([1.0])) - 1) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
-    >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+    >>> bool(abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5)
     True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
     >>> sameModel = LinearRegressionModel.load(sc, path)
-    >>> abs(sameModel.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(sameModel.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(sameModel.predict(np.array([1.0])) - 1) < 0.5
+    >>> bool(abs(sameModel.predict(np.array([1.0])) - 1) < 0.5)
     True
-    >>> abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     >>> from shutil import rmtree
     >>> try:
@@ -221,16 +221,16 @@ class LinearRegressionModel(LinearRegressionModelBase):
     ... ]
     >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
     ...     initialWeights=np.array([1.0]))
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
     ...     miniBatchFraction=1.0, initialWeights=np.array([1.0]), regParam=0.1, regType="l2",
     ...     intercept=True, validateData=True)
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     """

@@ -402,23 +402,23 @@ class LassoModel(LinearRegressionModelBase):
     ... ]
     >>> lrm = LassoWithSGD.train(
     ...     sc.parallelize(data), iterations=10, initialWeights=np.array([1.0]))
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
+    >>> bool(abs(lrm.predict(np.array([1.0])) - 1) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
-    >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+    >>> bool(abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5)
     True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
     >>> sameModel = LassoModel.load(sc, path)
-    >>> abs(sameModel.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(sameModel.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(sameModel.predict(np.array([1.0])) - 1) < 0.5
+    >>> bool(abs(sameModel.predict(np.array([1.0])) - 1) < 0.5)
     True
-    >>> abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     >>> from shutil import rmtree
     >>> try:
@@ -433,16 +433,16 @@ class LassoModel(LinearRegressionModelBase):
     ... ]
     >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
     ...     initialWeights=np.array([1.0]))
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
     ...     regParam=0.01, miniBatchFraction=1.0, initialWeights=np.array([1.0]), intercept=True,
     ...     validateData=True)
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     """
@@ -580,23 +580,23 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     ... ]
     >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10,
     ...     initialWeights=np.array([1.0]))
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5
+    >>> bool(abs(lrm.predict(np.array([1.0])) - 1) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
-    >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+    >>> bool(abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5)
     True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
     >>> sameModel = RidgeRegressionModel.load(sc, path)
-    >>> abs(sameModel.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(sameModel.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(sameModel.predict(np.array([1.0])) - 1) < 0.5
+    >>> bool(abs(sameModel.predict(np.array([1.0])) - 1) < 0.5)
     True
-    >>> abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     >>> from shutil import rmtree
     >>> try:
@@ -611,16 +611,16 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     ... ]
     >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10,
     ...     initialWeights=np.array([1.0]))
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0,
     ...     regParam=0.01, miniBatchFraction=1.0, initialWeights=np.array([1.0]), intercept=True,
     ...     validateData=True)
-    >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+    >>> bool(abs(lrm.predict(np.array([0.0])) - 0) < 0.5)
     True
-    >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+    >>> bool(abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5)
     True
     """
@@ -764,19 +764,19 @@ class IsotonicRegressionModel(Saveable, Loader["IsotonicRegressionModel"]):
     --------
     >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
    >>> irm = IsotonicRegression.train(sc.parallelize(data))
-    >>> irm.predict(3)
+    >>> float(irm.predict(3))
     2.0
-    >>> irm.predict(5)
+    >>> float(irm.predict(5))
     16.5
-    >>> irm.predict(sc.parallelize([3, 5])).collect()
+    >>> list(map(float, irm.predict(sc.parallelize([3, 5])).collect()))
     [2.0, 16.5]
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> irm.save(sc, path)
     >>> sameModel = IsotonicRegressionModel.load(sc, path)
-    >>> sameModel.predict(3)
+    >>> float(sameModel.predict(3))
     2.0
-    >>> sameModel.predict(5)
+    >>> float(sameModel.predict(5))
     16.5
     >>> from shutil import rmtree
     >>> try:
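Why cast rather than update the expected outputs to np.True_ and friends? A doctest can match only one literal, and the wrapped reprs differ between NumPy majors, so expected outputs like np.True_ would break environments still on NumPy 1.x. Casting makes the printed value version-independent; a small demonstration that runs on either major:

    >>> import numpy as np
    >>> repr(np.bool_(True)) in ("True", "np.True_")   # differs by numpy major
    True
    >>> repr(bool(np.bool_(True)))                     # identical on both
    'True'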
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 6e63cff1d37b9..55f15fd2eb1a2 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -2631,7 +2631,7 @@ def first_valid_index(self) -> Optional[Union[Scalar, Tuple[Scalar, ...]]]:
         500    5.0
         dtype: float64

-        >>> s.first_valid_index()
+        >>> int(s.first_valid_index())
         300

         Support for MultiIndex
@@ -2950,7 +2950,7 @@ def get(self, key: Any, default: Optional[Any] = None) -> Any:
         20  1  b
         20  2  b

-        >>> df.x.get(10)
+        >>> int(df.x.get(10))
         0

         >>> df.x.get(20)
@@ -3008,7 +3008,7 @@ def squeeze(self, axis: Optional[Axis] = None) -> Union[Scalar, "DataFrame", "Se
         0    2
         dtype: int64

-        >>> even_primes.squeeze()
+        >>> int(even_primes.squeeze())
         2

         Squeezing objects with more than one value in every axis does nothing:
@@ -3066,7 +3066,7 @@ def squeeze(self, axis: Optional[Axis] = None) -> Union[Scalar, "DataFrame", "Se

         Squeezing all axes will project directly into a scalar:

-        >>> df_1a.squeeze()
+        >>> int(df_1a.squeeze())
         3
         """
         if axis is not None:
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py
index b5bf65a4907b7..c93366a31e315 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -122,7 +122,7 @@ class AtIndexer(IndexerLike):

     Get value at specified row/column pair

-    >>> psdf.at[4, 'B']
+    >>> int(psdf.at[4, 'B'])
     2

     Get array if an index occurs multiple times
@@ -202,7 +202,7 @@ class iAtIndexer(IndexerLike):

     Get value at specified row/column pair

-    >>> df.iat[1, 2]
+    >>> int(df.iat[1, 2])
     1

     Get value within a series
@@ -214,7 +214,7 @@ class iAtIndexer(IndexerLike):
     30    3
     dtype: int64

-    >>> psser.iat[1]
+    >>> int(psser.iat[1])
     2
     """

@@ -853,7 +853,7 @@ class LocIndexer(LocIndexerLike):

     Single label for column.

-    >>> df.loc['cobra', 'shield']
+    >>> int(df.loc['cobra', 'shield'])
     2

     List of labels for row.
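The pandas-on-Spark changes here and in series.py below follow the same NEP 51 rationale: scalar access through .at, .iat, .loc, get, squeeze, pop, and asof returns a numpy scalar, whose repr is now wrapped. Plain pandas shows the same behavior and is the quickest way to see it (assuming a default int64 column and NumPy >= 2.0):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"B": [2, 3]}, index=[4, 5])
    >>> df.at[4, "B"]          # NumPy 1.x printed: 2
    np.int64(2)
    >>> int(df.at[4, "B"])
    2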
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index ff941b692f95f..7e276860fbab1 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -4558,7 +4558,7 @@ def pop(self, item: Name) -> Union["Series", Scalar]:
         C    2
         dtype: int64

-        >>> s.pop('A')
+        >>> int(s.pop('A'))
         0

         >>> s
@@ -5821,7 +5821,7 @@ def asof(self, where: Union[Any, List]) -> Union[Scalar, "Series"]:

         A scalar `where`.

-        >>> s.asof(20)
+        >>> float(s.asof(20))
         2.0

         For a sequence `where`, a Series is returned. The first value is
@@ -5836,12 +5836,12 @@ def asof(self, where: Union[Any, List]) -> Union[Scalar, "Series"]:
         Missing values are not considered. The following is ``2.0``, not NaN,
         even though NaN is at the index location for ``30``.

-        >>> s.asof(30)
+        >>> float(s.asof(30))
         2.0

         >>> s = ps.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40])
         >>> with ps.option_context("compute.eager_check", False):
-        ...     s.asof(20)
+        ...     float(s.asof(20))
         ...
         1.0
         """