Covariance (#2)

LukeMathWalker · web-flow · commit d75844e493d4 · 2018-09-21T09:10:42.000+02:00
* Added new file.

* Added correlation module to lib.rs

* Added stub for covariance method.

* Implement signature for covariance, alongside first failing test.

* Added one test for panic if working on a 1d array.

* Check number of dimensions before proceeding. Panic if invalid.

* First implementation of covariance for 2-dimensional arrays.

* Improved test using all_close.

* Added one test with a random array.

* Added a new test to check covariance_matrix is symmetric.

* Added another test to check for panic when passing an invalid ddof.

* Using quickcheck to test the symmetry property for covariance matrices.

* Moved constant matrix test under quickcheck to generalize on the constant value.

* Added docs to cov. Published the CorrelationExt crate.

* Added another reason for panic to the docs.

* Added one more test and one more reason to panic to the docs.

* Added one more test for a badly conditioned array.
diff --git a/Cargo.toml b/Cargo.toml
@@ -11,6 +11,7 @@ rand = "0.5"
 
 [dev-dependencies]
 quickcheck = "0.7"
+ndarray-rand = "0.8"
 
 [patch.crates-io]
 noisy_float = { git = "https://github.com/SergiusIW/noisy_float-rs.git", rev = "c33a94803987475bbd205c9ff5a697af533f9a17" }
diff --git a/src/correlation.rs b/src/correlation.rs
@@ -0,0 +1,186 @@
+use ndarray::prelude::*;
+use ndarray::Data;
+use num_traits::{Float, FromPrimitive};
+
+/// Extension trait that adds the covariance method `cov` to arrays whose elements have type `A`.
+pub trait CorrelationExt<A, S>
+where
+    S: Data<Elem = A>,
+{
+    /// Return the covariance matrix `C` for a 2-dimensional
+    /// array of observations `M`.
+    ///
+    /// Let `(r, o)` be the shape of `M` (the returned `C` then has shape `(r, r)`):
+    /// - `r` is the number of random variables;
+    /// - `o` is the number of observations we have collected for each random variable.
+    ///
+    /// Every column in `M` is an experiment: a single observation for each
+    /// random variable.
+    /// Each row in `M` contains all the observations for a certain random variable.
+    ///
+    /// The parameter `ddof` specifies the "delta degrees of freedom". For
+    /// example, to calculate the population covariance, use `ddof = 0`, or to
+    /// calculate the sample covariance (unbiased estimate), use `ddof = 1`.
+    ///
+    /// The covariance of two random variables is defined as:
+    ///
+    /// ```text
+    ///                1       n
+    /// cov(X, Y) = ――――――――   ∑ (xᵢ - x̅)(yᵢ - y̅)
+    ///             n - ddof  i=1
+    /// ```
+    ///
+    /// where
+    ///
+    /// ```text
+    ///     1   n
+    /// x̅ = ―   ∑ xᵢ
+    ///     n  i=1
+    /// ```
+    /// and similarly for y̅.
+    ///
+    /// **Panics** if `ddof` is greater than or equal to the number of
+    /// observations, if `M` is empty or if the type cast of `n_observations`
+    /// from `usize` to `A` fails.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// extern crate ndarray;
+    /// extern crate ndarray_stats;
+    /// use ndarray::{aview2, arr2};
+    /// use ndarray_stats::CorrelationExt;
+    ///
+    /// let a = arr2(&[[1., 3., 5.],
+    ///                [2., 4., 6.]]);
+    /// let covariance = a.cov(1.);
+    /// assert_eq!(
+    ///    covariance, 
+    ///    aview2(&[[4., 4.], [4., 4.]])
+    /// );
+    /// ```
+    fn cov(&self, ddof: A) -> Array2<A> 
+    where
+        A: Float + FromPrimitive;
+}
+
+impl<A: 'static, S> CorrelationExt<A, S> for ArrayBase<S, Ix2>
+where
+    S: Data<Elem = A>,
+{
+    // Covariance between the rows of `self`; observations run along axis 1.
+    fn cov(&self, ddof: A) -> Array2<A>
+    where
+        A: Float + FromPrimitive,
+    {
+        let columns = Axis(1);
+        let sample_count = A::from_usize(self.len_of(columns)).unwrap();
+        if ddof >= sample_count {
+            panic!("`ddof` needs to be strictly smaller than the \
+                    number of observations provided for each \
+                    random variable!")
+        }
+        let divisor = sample_count - ddof;
+        // Centre every row on its own mean before forming the products.
+        let row_means = self.mean_axis(columns).insert_axis(columns);
+        let centered = self - &row_means;
+        let scatter = centered.dot(&centered.t());
+        scatter.mapv_into(|entry| entry / divisor)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand;
+    use rand::distributions::Range;
+    use ndarray_rand::RandomExt;
+
+    quickcheck! { // Property-style tests driven by the `quickcheck` crate.
+        fn constant_random_variables_have_zero_covariance_matrix(value: f64) -> bool {
+            let n_random_variables = 3;
+            let n_observations = 4;
+            let a = Array::from_elem((n_random_variables, n_observations), value); // every entry equals `value`
+            a.cov(1.).all_close(
+                &Array::zeros((n_random_variables, n_random_variables)),
+                1e-8 // tolerance passed to `all_close`
+            )
+        }
+
+        fn covariance_matrix_is_symmetric(bound: f64) -> bool {
+            let n_random_variables = 3;
+            let n_observations = 4;
+            let a = Array::random(
+                (n_random_variables, n_observations), 
+                Range::new(-bound.abs(), bound.abs()) // sampling range is symmetric around zero
+            );
+            let covariance = a.cov(1.);
+            covariance.all_close(&covariance.t(), 1e-8) // compare the result with its own transpose
+        }
+    }
+    
+    #[test]
+    #[should_panic]
+    fn test_invalid_ddof() {
+        let n_random_variables = 3;
+        let n_observations = 4;
+        let a = Array::random(
+            (n_random_variables, n_observations), 
+            Range::new(0., 10.)
+        );
+        let invalid_ddof = (n_observations as f64) + rand::random::<f64>().abs(); // >= n_observations, which `cov` rejects
+        a.cov(invalid_ddof);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_empty_matrix() {
+        let a: Array2<f32> = array![[], []]; // two rows, no observations
+        // With zero observations any ddof >= 0 trips the `ddof` check; a negative ddof (-1 < 0) gets past it, so the expected panic must come from later in `cov`.
+        a.cov(-1.);
+    }
+
+    #[test]
+    fn test_covariance_for_random_array() {
+        let a = array![
+            [ 0.72009497,  0.12568055,  0.55705966,  0.5959984 ,  0.69471457],
+            [ 0.56717131,  0.47619486,  0.21526298,  0.88915366,  0.91971245],
+            [ 0.59044195,  0.10720363,  0.76573717,  0.54693675,  0.95923036],
+            [ 0.24102952,  0.131347,  0.11118028,  0.21451351,  0.30515539],
+            [ 0.26952473,  0.93079841,  0.8080893 ,  0.42814155,  0.24642258]
+        ];
+        let numpy_covariance = array![ // reference values; presumably computed with numpy, going by the name
+            [ 0.05786248,  0.02614063,  0.06446215,  0.01285105, -0.06443992],
+            [ 0.02614063,  0.08733569,  0.02436933,  0.01977437, -0.06715555],
+            [ 0.06446215,  0.02436933,  0.10052129,  0.01393589, -0.06129912],
+            [ 0.01285105,  0.01977437,  0.01393589,  0.00638795, -0.02355557],
+            [-0.06443992, -0.06715555, -0.06129912, -0.02355557,  0.09909855]
+        ];
+        assert_eq!(a.ndim(), 2);
+        assert!(
+            a.cov(1.).all_close(
+                &numpy_covariance,
+                1e-8
+            )
+        );
+    }
+
+    #[test]
+    #[should_panic]
+    // Known limitation: precision is lost, so the result misses the expected values at the 1e-24 tolerance; the assert fails and this test expects that panic.
+    fn test_covariance_for_badly_conditioned_array() {
+        let a: Array2<f64> = array![
+            [ 1e12 + 1.,  1e12 - 1.],
+            [ 1e-6 + 1e-12,  1e-6 - 1e-12],
+        ];
+        let expected_covariance = array![
+            [2., 2e-12], [2e-12, 2e-24]
+        ];
+        assert!(
+            a.cov(1.).all_close(
+                &expected_covariance,
+                1e-24 // tolerance passed to `all_close`
+            )
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,17 +1,22 @@
 #[macro_use(azip, s)]
+#[cfg_attr(test, macro_use(array))]
 extern crate ndarray;
 extern crate noisy_float;
 extern crate num_traits;
 extern crate rand;
 
+#[cfg(test)]
+extern crate ndarray_rand;
 #[cfg(test)]
 #[macro_use(quickcheck)]
 extern crate quickcheck;
 
 pub use maybe_nan::{MaybeNan, MaybeNanExt};
 pub use quantile::{interpolate, QuantileExt};
 pub use sort::Sort1dExt;
+pub use correlation::CorrelationExt;
 
 mod maybe_nan;
 mod quantile;
 mod sort;
+mod correlation;