Commit 01601d4

chanedwin (Edwin Chan) authored and Edwin Chan committed
initial commit for the spark-backend, with a working Spark pipeline, tests and CI up, and the visions integration pulled in
Update integrations.rst (ydataai#544)

fix ydataai#537 ValueError race condition when running multiprocessing with describe1d (ydataai#549)
* include tests for issue 537
* fix hidden side effect from the previous series.fillna(inplace=True) call by explicitly dropping NAs

Give visibility to our support (ydataai#536)
* Add support mention

Change formatters for overview (ydataai#535)

Fix 523 (ydataai#533)

Incompatible with pandas 1.1.0 (ydataai#557)

Notebook update instructions (ydataai#556)

Fix 545 and test pandas 1.0.5 and >=1.1 (ydataai#558)

Bump visions[type_image_path] from 0.4.4 to 0.5.0 (ydataai#547)
Bumps [visions[type_image_path]](https://github.com/dylan-profiler/visions) from 0.4.4 to 0.5.0.
- [Release notes](https://github.com/dylan-profiler/visions/releases)
- [Commits](dylan-profiler/visions@v0.4.4...0.5.0)

Update frequent issues (ydataai#564)

Fix warning from cmap (ydataai#565)

Feature/distinct unique (ydataai#566)
* Fix ydataai#539

v2.9.0 details (ydataai#567) [skip ci]

Code formatting

Visions integration

Build summary from graph structure

Fix a few more tests

Typeset changes + test updates

Type checking

Correlations

Handler, warning structure, random sample, test fix

Test fix

Fixes

Fix warning

Captions missing diagrams

Fix 51 Unhashable

Process comments

Fix tests

Update messages.py

Add threshold to all correlation configs

Remove unused renderers (ydataai#580)

Update README.md

Fix check for infinite values (ydataai#588)

Bump visions[type_image_path] from 0.5.0 to 0.6.0
Bumps [visions[type_image_path]](https://github.com/dylan-profiler/visions) from 0.5.0 to 0.6.0.
- [Release notes](https://github.com/dylan-profiler/visions/releases)
- [Commits](dylan-profiler/visions@0.5.0...v0.6.0)
Signed-off-by: dependabot-preview[bot] <[email protected]>

Update get_scatter_matrix for sparse dataframes

For a dataframe like:

       A     B     C
  0  1.0   7.0   NaN
  1  2.0   8.0   NaN
  2  3.0   9.0   NaN
  3  4.0   NaN  13.0
  4  5.0   NaN  14.0
  5  6.0   NaN  15.0
  6  NaN  10.0  16.0
  7  NaN  11.0  17.0
  8  NaN  12.0  18.0

the 'Interactions' tab would not display any data (since every row contains a NaN), even though every pair of columns contains valid data to plot. This change allows columns A, B, and C to be plotted pairwise against each other by dropping only the rows with NaNs in the two columns of each pair.

Update plot.py

Notation
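The pairwise-dropna idea described in the get_scatter_matrix note can be sketched in a few lines of pandas. This only illustrates the described behaviour; the real change lives in plot.py:

import numpy as np
import pandas as pd

# Sketch of the pairwise-dropna behaviour described above, using the same
# 9-row A/B/C frame from the commit message.
df = pd.DataFrame({
    "A": [1, 2, 3, 4, 5, 6, np.nan, np.nan, np.nan],
    "B": [7, 8, 9, np.nan, np.nan, np.nan, 10, 11, 12],
    "C": [np.nan, np.nan, np.nan, 13, 14, 15, 16, 17, 18],
})

print(len(df.dropna()))  # 0 -> the old behaviour had nothing to plot

for x in df.columns:
    for y in df.columns:
        if x == y:
            continue
        pair = df[[x, y]].dropna()  # drop NaNs only within this pair
        print(x, y, len(pair))      # each pair keeps its 3 valid rows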
1 parent f8333d7 commit 01601d4

27 files changed (+1806, -84 lines)

.travis.yml

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+os: linux
+dist: bionic
+language: python
+cache:
+  pip: true
+  directories:
+    - data/
+
+jobs:
+  include:
+    - os: linux
+      name: "Python 3.9-dev on Linux"
+      python: 3.9-dev
+      env: TEST=examples PANDAS=">=1"
+      before_install:
+        - sudo apt-get -y install libopenblas-dev
+
+  allow_failures:
+    - name: "Python 3.9-dev on Linux"
+    - env: TEST=spark PANDAS=">=1.1" SPARK_VERSION=2.4.7 HADOOP_VERSION=2.7
+      python: 3.8
+    - env: TEST=spark PANDAS=">=1.1" SPARK_VERSION=2.3.0 HADOOP_VERSION=2.7
+      python: 3.8
+
+python:
+  - 3.6
+  - 3.7
+  - 3.8
+
+env:
+  global:
+    - JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+  jobs:
+    - TEST=unit PANDAS="<1"
+    - TEST=issue PANDAS="<1"
+    - TEST=console PANDAS="<1"
+    - TEST=examples PANDAS="<1"
+    - TEST=unit PANDAS="==1.0.5"
+    - TEST=issue PANDAS="==1.0.5"
+    - TEST=unit PANDAS=">=1.1"
+    - TEST=issue PANDAS=">=1.1"
+    - TEST=console PANDAS=">=1.1"
+    - TEST=examples PANDAS=">=1.1"
+    - TEST=lint PANDAS=">=1.1"
+    - TEST=typing PANDAS=">=1.1"
+    - TEST=spark PANDAS=">=1.1" SPARK_VERSION=2.3.0 HADOOP_VERSION=2.7
+    - TEST=spark PANDAS=">=1.1" SPARK_VERSION=2.4.7 HADOOP_VERSION=2.7
+    - TEST=spark PANDAS=">=1.1" SPARK_VERSION=3.0.1 HADOOP_VERSION=2.7
+
+before_install:
+  - pip install --upgrade pip setuptools wheel
+  - pip install -r requirements.txt
+  - pip install -r requirements-test.txt
+  - pip install "pandas$PANDAS"
+  - sudo apt-get -y install curl
+
+install:
+  - check-manifest
+  - python setup.py sdist bdist_wheel
+  - twine check dist/*
+  - pip install -e .[notebook,app]
+
+script:
+  - >
+    if [ $TEST == 'unit' ];
+    then pytest -m "not sparktest" --cov=. tests/unit/;
+    fi
+  - >
+    if [ $TEST == 'issue' ];
+    then pytest --cov=. tests/issues/;
+    fi
+  - >
+    if [ $TEST == 'examples' ];
+    then pytest --cov=. --nbval tests/notebooks/;
+    fi
+  - >
+    if [ $TEST == 'console' ];
+    then pandas_profiling -h;
+    fi
+  - >
+    if [ $TEST == 'typing' ];
+    then make typing;
+    fi
+  - >
+    if [ $TEST == 'lint' ];
+    then python -m black --check --diff --quiet .;
+    isort --check-only --profile black .;
+    flake8 . --select=E9,F63,F7,F82 --show-source --statistics;
+    fi
+  - >
+    if [ $TEST == 'spark' ];
+    then SPARK_VERSION=${SPARK_VERSION} HADOOP_VERSION=${HADOOP_VERSION} make install-spark-ci;
+    JAVA_HOME=${JAVA_HOME} SPARK_HOME=${TRAVIS_BUILD_DIR}/spark/ make test-spark;
+    fi
+
+after_success:
+  - codecov -F $TEST

Makefile

Lines changed: 9 additions & 0 deletions
@@ -24,6 +24,9 @@ test_cov:
 	pandas_profiling -h
 	make typing
 
+test-spark:
+	pytest -m sparktest --black tests/unit/
+
 examples:
 	find ./examples -maxdepth 2 -type f -name "*.py" -execdir python {} \;
 
@@ -37,6 +40,12 @@ pypi_package:
 install:
 	pip install -e .[notebook]
 
+install-spark-ci:
+	sudo apt-get -y install openjdk-8-jdk
+	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+		--output ${TRAVIS_BUILD_DIR}/spark.tgz
+	tar -xvzf ${TRAVIS_BUILD_DIR}/spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
+
 lint:
 	pre-commit run --all-files

pyproject.toml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+[tool.pytest.ini_options]
+markers = ["sparktest",]
+[tool.pytest.ini_options.spark_options]
+"spark.executor.id" = "driver"
+"spark.app.name" = "PySparkShell"
+"spark.executor.instances" = 1
+"master" = "local[*]"
+"spark.driver.host" = "192.168.1.78"
+"spark.sql.catalogImplementation" = "in-memory"

requirements-spark.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# this provides the recommended pyspark and pyarrow versions for spark to work on pandas-profiling
+# note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
+# set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
+pyspark>=2.3.0
+pyarrow>=0.8.0
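The ARROW_PRE_0_15_IPC_FORMAT note concerns Arrow-backed toPandas(). A minimal sketch of that path, assuming pyspark 2.3/2.4 where the relevant switch is spark.sql.execution.arrow.enabled:

from pyspark.sql import SparkSession

# Sketch of the Arrow-backed toPandas() conversion the comment above refers to.
# With pyarrow >= 0.15 on pyspark 2.3/2.4 you may additionally need
# ARROW_PRE_0_15_IPC_FORMAT=1 in conf/spark-env.sh, exactly as the note says.
spark = SparkSession.builder.master("local[*]").appName("arrow-check").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = spark.range(1000).toPandas()  # Arrow-accelerated conversion when enabled
print(pdf.shape)                    # (1000, 1)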

requirements-test.txt

Lines changed: 13 additions & 3 deletions
@@ -1,11 +1,21 @@
 pytest
 coverage<5
 codecov
-pytest-mypy
+pytest-mypy>=0.7.0
+
+# this is because mypy had an issue where singledispatch _ usage resulted in errors
+# https://github.com/python/mypy/issues/4117
+mypy>=0.761
+
 pytest-cov
+pytest-black
 nbval
 fastparquet==0.4.1
 flake8
-check-manifest>=0.41
+check-manifest>=0.42
 twine>=3.1.1
-kaggle
+kaggle
+
+# spark dependency
+pytest-spark>=0.6.0
+pyarrow>=0.8.0
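The mypy pin above refers to the conventional singledispatch registration style, where every overload is named _; older mypy reported this as a redefinition (python/mypy#4117). A minimal sketch of the pattern:

from functools import singledispatch

# The `_` registration style the mypy pin is about (python/mypy#4117):
# each overload is conventionally named `_`, which older mypy treated as
# an invalid redefinition.
@singledispatch
def describe(value) -> str:
    return "unsupported"

@describe.register
def _(value: int) -> str:
    return "numeric"

@describe.register
def _(value: str) -> str:
    return "categorical"

print(describe(3), describe("a"))  # numeric categorical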

requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -22,4 +22,6 @@ requests>=2.24.0
 tqdm>=4.48.2
 # Jupyter notebook
 ipywidgets>=7.5.1
-seaborn>=0.10.1
+seaborn>=0.10.1
+# Single dispatch lib
+singledispatchmethod>=1.0.0
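singledispatchmethod backports functools.singledispatchmethod (added in Python 3.8) to the 3.6/3.7 interpreters in the build matrix. A minimal sketch of method-level dispatch, with a hypothetical Summarizer class that is not part of pandas-profiling:

from singledispatchmethod import singledispatchmethod  # backport for Python < 3.8

# Method-level single dispatch, the mechanism the new correlations code
# leans on. `Summarizer` is illustrative only.
class Summarizer:
    @singledispatchmethod
    def summarize(self, value):
        raise NotImplementedError(f"no summary for {type(value)!r}")

    @summarize.register
    def _(self, value: int):
        return {"type": "integer", "value": value}

    @summarize.register
    def _(self, value: str):
        return {"type": "string", "length": len(value)}

print(Summarizer().summarize(42))  # {'type': 'integer', 'value': 42}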

setup.py

Lines changed: 3 additions & 0 deletions
@@ -42,6 +42,9 @@
     package_data={
         "pandas_profiling": ["py.typed"],
     },
+    package_data={
+        "pandas_profiling": ["py.typed"],
+    },
     include_package_data=True,
     classifiers=[
         "Development Status :: 5 - Production/Stable",

src/pandas_profiling/model/correlations.py

Lines changed: 10 additions & 0 deletions
@@ -7,10 +7,14 @@
 import pandas as pd
 from pandas.core.base import DataError
 from scipy import stats
+from singledispatchmethod import singledispatchmethod
 
 from pandas_profiling.config import config
 from pandas_profiling.model.typeset import Boolean, Categorical, Numeric, Unsupported
 
+        Args:
+            df:
+            summary:
 
 class Correlation:
     @staticmethod
@@ -35,6 +39,7 @@ class Kendall(Correlation):
     def compute(df, summary) -> Optional[pd.DataFrame]:
         return df.corr(method="kendall")
 
+    """
 
 class Cramers(Correlation):
     @staticmethod
@@ -128,6 +133,11 @@ def compute(df, summary) -> Optional[pd.DataFrame]:
 
     return correlation
 
+    @compute.register(SparkDataFrame)
+    @staticmethod
+    def _compute_spark(df: SparkDataFrame, summary) -> Optional[pd.DataFrame]:
+        """
+        Use pandasUDF to compute this first, but probably can be optimised further
 
 def warn_correlation(correlation_name: str, error):
     warnings.warn(
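The diff truncates _compute_spark after its docstring, so the body is not visible here. One plausible way to compute a Spark-side Spearman matrix, using pyspark.ml rather than the pandas UDF the docstring mentions; spearman_matrix is a hypothetical name, not the commit's actual implementation:

import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation as SparkCorrelation

# Hedged sketch of a Spark-side Spearman computation; numeric, non-null
# columns are assumed, and the alias avoids clashing with the module's own
# Correlation class.
def spearman_matrix(df) -> pd.DataFrame:
    cols = [f.name for f in df.schema.fields]
    assembled = VectorAssembler(inputCols=cols, outputCol="features").transform(df)
    matrix = SparkCorrelation.corr(assembled, "features", method="spearman").head()[0]
    return pd.DataFrame(matrix.toArray(), index=cols, columns=cols)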
