From ab7a1908f72bb2e2744571112f22c4b927bff9ff Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Mon, 13 Apr 2020 16:24:20 +0000 Subject: [PATCH 01/40] Removed run_all_in_graph_and_eager_mode in attention_wrapper_test.py. --- .../seq2seq/tests/attention_wrapper_test.py | 433 +++++++++--------- 1 file changed, 228 insertions(+), 205 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index ccfd259db4..c66e2ed3fe 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -16,6 +16,7 @@ import collections +import pytest from absl.testing import parameterized import numpy as np import tensorflow as tf @@ -26,10 +27,8 @@ from tensorflow_addons.seq2seq import sampler as sampler_py -@test_utils.run_all_in_graph_and_eager_modes -class AttentionMechanismTest(tf.test.TestCase, parameterized.TestCase): - def setUp(self): - super().setUp() +class DummyData: + def __init__(self): self.batch = 10 self.timestep = 5 self.memory_size = 6 @@ -44,217 +43,241 @@ def setUp(self): self.query = np.random.randn(self.batch, self.units).astype(np.float32) self.state = np.random.randn(self.batch, self.timestep).astype(np.float32) - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), - ) - def test_attention_shape_inference(self, attention_cls): - attention = attention_cls(self.units, self.memory) - attention_score = attention([self.query, self.state]) - self.assertLen(attention_score, 2) - self.assertEqual(attention_score[0].shape, (self.batch, self.timestep)) - self.assertEqual(attention_score[1].shape, (self.batch, self.timestep)) - - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), - ) - def test_get_config(self, attention_cls): - attention = attention_cls(self.units, self.memory) - config = attention.get_config() - - attention_from_config = attention_cls.from_config(config) - config_from_clone = attention_from_config.get_config() - self.assertDictEqual(config, config_from_clone) - - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), - ) - def test_layer_output(self, attention_cls): - attention = attention_cls(self.units, self.memory) - score = attention([self.query, self.state]) - self.evaluate(tf.compat.v1.variables_initializer(attention.variables)) - - score_val = self.evaluate(score) - self.assertLen(score_val, 2) - self.assertEqual(score_val[0].shape, (self.batch, self.timestep)) - self.assertEqual(score_val[1].shape, (self.batch, self.timestep)) - - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def 
test_attention_shape_inference(attention_cls): + dummy_data = DummyData() + attention = attention_cls(dummy_data.units, dummy_data.memory) + attention_score = attention([dummy_data.query, dummy_data.state]) + assert len(attention_score) == 2 + assert attention_score[0].shape == (dummy_data.batch, dummy_data.timestep) + assert attention_score[1].shape == (dummy_data.batch, dummy_data.timestep) + + +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def test_get_config(attention_cls): + dummy_data = DummyData() + attention = attention_cls(dummy_data.units, dummy_data.memory) + config = attention.get_config() + + attention_from_config = attention_cls.from_config(config) + config_from_clone = attention_from_config.get_config() + + assert config == config_from_clone + + +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def test_layer_output(attention_cls): + dummy_data = DummyData() + attention = attention_cls(dummy_data.units, dummy_data.memory) + score = attention([dummy_data.query, dummy_data.state]) + + assert len(score) == 2 + assert score[0].shape == (dummy_data.batch, dummy_data.timestep) + assert score[1].shape == (dummy_data.batch, dummy_data.timestep) + + +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def test_passing_memory_from_call(attention_cls): + dummy_data = DummyData() + attention = attention_cls(dummy_data.units, dummy_data.memory) + weights_before_query = attention.get_weights() + ref_score = attention([dummy_data.query, dummy_data.state]) + + all_weights = attention.get_weights() + config = attention.get_config() + # Simulate the twice invocation of calls here. + attention_from_config = attention_cls.from_config(config) + attention_from_config.build(dummy_data.memory.shape) + attention_from_config.set_weights(weights_before_query) + attention_from_config(dummy_data.memory, setup_memory=True) + attention_from_config.build([dummy_data.query.shape, dummy_data.state.shape]) + attention_from_config.set_weights(all_weights) + score = attention_from_config([dummy_data.query, dummy_data.state]) + + np.testing.assert_allclose(ref_score, score) + + +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def test_save_load_layer(attention_cls): + dummy_data = DummyData() + vocab = 20 + embedding_dim = 6 + inputs = tf.keras.Input(shape=[dummy_data.timestep]) + encoder_input = tf.keras.layers.Embedding(vocab, embedding_dim, mask_zero=True)( + inputs ) - def test_passing_memory_from_call(self, attention_cls): - attention = attention_cls(self.units, self.memory) - weights_before_query = attention.get_weights() - ref_score = attention([self.query, self.state]) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - ref_score_val = self.evaluate(ref_score) - - all_weights = attention.get_weights() - config = attention.get_config() - # Simulate the twice invocation of calls here. 
- attention_from_config = attention_cls.from_config(config) - attention_from_config.build(self.memory.shape) - attention_from_config.set_weights(weights_before_query) - attention_from_config(self.memory, setup_memory=True) - attention_from_config.build([self.query.shape, self.state.shape]) - attention_from_config.set_weights(all_weights) - score = attention_from_config([self.query, self.state]) - - score_val = self.evaluate(score) - self.assertAllClose(ref_score_val, score_val) - - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), + encoder_output = tf.keras.layers.LSTM( + dummy_data.memory_size, return_sequences=True + )(encoder_input) + + attention = attention_cls(dummy_data.units, encoder_output) + query = tf.keras.Input(shape=[dummy_data.units]) + state = tf.keras.Input(shape=[dummy_data.timestep]) + + score = attention([query, state]) + + x_test = np.random.randint(vocab, size=(dummy_data.batch, dummy_data.timestep)) + model = tf.keras.Model([inputs, query, state], score) + # Fall back to v1 style Keras training loop until issue with + # using outputs of a layer in another layer's constructor. + model.compile("rmsprop", "mse") + y_ref = model.predict_on_batch([x_test, dummy_data.query, dummy_data.state]) + + config = model.get_config() + weights = model.get_weights() + loaded_model = tf.keras.Model.from_config( + config, custom_objects={attention_cls.__name__: attention_cls} ) - def test_save_load_layer(self, attention_cls): - vocab = 20 - embedding_dim = 6 - inputs = tf.keras.Input(shape=[self.timestep]) - encoder_input = tf.keras.layers.Embedding(vocab, embedding_dim, mask_zero=True)( - inputs - ) - encoder_output = tf.keras.layers.LSTM(self.memory_size, return_sequences=True)( - encoder_input + loaded_model.set_weights(weights) + + # Fall back to v1 style Keras training loop until issue with + # using outputs of a layer in another layer's constructor. + loaded_model.compile("rmsprop", "mse") + + y = loaded_model.predict_on_batch([x_test, dummy_data.query, dummy_data.state]) + + np.testing.assert_allclose(y_ref, y) + + +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def test_manual_memory_reset(attention_cls): + dummy_data = DummyData() + attention = attention_cls(dummy_data.units) + + def _compute_score(batch_size=None): + if batch_size is None: + batch_size = dummy_data.batch + memory = dummy_data.memory[:batch_size] + attention.setup_memory( + memory, memory_sequence_length=dummy_data.memory_length[:batch_size] ) - - attention = attention_cls(self.units, encoder_output) - query = tf.keras.Input(shape=[self.units]) - state = tf.keras.Input(shape=[self.timestep]) - - score = attention([query, state]) - - x_test = np.random.randint(vocab, size=(self.batch, self.timestep)) - model = tf.keras.Model([inputs, query, state], score) - # Fall back to v1 style Keras training loop until issue with - # using outputs of a layer in another layer's constructor. 
+ assert attention.values.shape.as_list() == list(memory.shape) + assert attention.keys.shape.as_list() == list(memory.shape)[:-1] + [ + dummy_data.units + ] + return attention([dummy_data.query[:batch_size], dummy_data.state[:batch_size]]) + + _compute_score(batch_size=dummy_data.batch) + variables = list(attention.variables) + _compute_score(batch_size=dummy_data.batch - 1) + + # No new variables were created. + for var_1, var_2 in zip(variables, list(attention.variables)): + assert var_1 is var_2 + + +def test_masking(): + memory = tf.ones([4, 4, 5], dtype=tf.float32) + memory_sequence_length = tf.constant([1, 2, 3, 4], dtype=tf.int32) + query = tf.ones([4, 5], dtype=tf.float32) + state = None + attention = wrapper.LuongAttention(5, memory, memory_sequence_length) + alignment, _ = attention([query, state]) + assert np.sum(np.triu(alignment, k=1)) == 0 + + +@pytest.mark.parametrize( + "attention_cls", + [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, + ], +) +def test_memory_re_setup(attention_cls): + class MyModel(tf.keras.models.Model): + def __init__(self, vocab, embedding_dim, memory_size, units): + super().__init__() + self.emb = tf.keras.layers.Embedding(vocab, embedding_dim, mask_zero=True) + self.encoder = tf.keras.layers.LSTM(memory_size, return_sequences=True) + self.attn_mch = attention_cls(units) + + def call(self, inputs): + enc_input, query, state = inputs + mask = self.emb.compute_mask(enc_input) + enc_input = self.emb(enc_input) + enc_output = self.encoder(enc_input, mask=mask) + # To ensure manual resetting also works in the graph mode, + # we call the attention mechanism twice. + self.attn_mch(enc_output, mask=mask, setup_memory=True) + self.attn_mch(enc_output, mask=mask, setup_memory=True) + score = self.attn_mch([query, state]) + return score + + vocab = 20 + embedding_dim = 6 + num_batches = 5 + + dummy_data = DummyData() + model = MyModel(vocab, embedding_dim, dummy_data.memory_size, dummy_data.units) + if tf.executing_eagerly(): + model.compile("rmsprop", "mse", run_eagerly=True) + else: model.compile("rmsprop", "mse") - y_ref = model.predict_on_batch([x_test, self.query, self.state]) - config = model.get_config() - weights = model.get_weights() - loaded_model = tf.keras.Model.from_config( - config, custom_objects={attention_cls.__name__: attention_cls} - ) - loaded_model.set_weights(weights) - - # Fall back to v1 style Keras training loop until issue with - # using outputs of a layer in another layer's constructor. 
- loaded_model.compile("rmsprop", "mse") - - y = loaded_model.predict_on_batch([x_test, self.query, self.state]) - - self.assertAllClose(y_ref, y) - - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), + x = np.random.randint( + vocab, size=(num_batches * dummy_data.batch, dummy_data.timestep) ) - def test_manual_memory_reset(self, attention_cls): - attention = attention_cls(self.units) - - def _compute_score(batch_size=None): - if batch_size is None: - batch_size = self.batch - memory = self.memory[:batch_size] - attention.setup_memory( - memory, memory_sequence_length=self.memory_length[:batch_size] - ) - self.assertListEqual(attention.values.shape.as_list(), list(memory.shape)) - self.assertListEqual( - attention.keys.shape.as_list(), list(memory.shape)[:-1] + [self.units] - ) - return attention([self.query[:batch_size], self.state[:batch_size]]) - - score = _compute_score(batch_size=self.batch) - variables = list(attention.variables) - score = _compute_score(batch_size=self.batch - 1) - - # No new variables were created. - for var_1, var_2 in zip(variables, list(attention.variables)): - self.assertIs(var_1, var_2) - - # Score can be computed without errors. - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(score) - - def test_masking(self): - memory = tf.ones([4, 4, 5], dtype=tf.float32) - memory_sequence_length = tf.constant([1, 2, 3, 4], dtype=tf.int32) - query = tf.ones([4, 5], dtype=tf.float32) - state = None - attention = wrapper.LuongAttention(5, memory, memory_sequence_length) - alignment, _ = attention([query, state]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - alignment = self.evaluate(alignment) - self.assertEqual(np.sum(np.triu(alignment, k=1)), 0) - - @parameterized.named_parameters( - ("luong", wrapper.LuongAttention), - ("luong_monotonic", wrapper.LuongMonotonicAttention), - ("bahdanau", wrapper.BahdanauAttention), - ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttention), + x_test = np.random.randint( + vocab, size=(num_batches * dummy_data.batch, dummy_data.timestep) ) - def test_memory_re_setup(self, attention_cls): - class MyModel(tf.keras.models.Model): - def __init__(self, vocab, embedding_dim, memory_size, units): - super().__init__() - self.emb = tf.keras.layers.Embedding( - vocab, embedding_dim, mask_zero=True - ) - self.encoder = tf.keras.layers.LSTM(memory_size, return_sequences=True) - self.attn_mch = attention_cls(units) - - def call(self, inputs): - enc_input, query, state = inputs - mask = self.emb.compute_mask(enc_input) - enc_input = self.emb(enc_input) - enc_output = self.encoder(enc_input, mask=mask) - # To ensure manual resetting also works in the graph mode, - # we call the attention mechanism twice. 
- self.attn_mch(enc_output, mask=mask, setup_memory=True) - self.attn_mch(enc_output, mask=mask, setup_memory=True) - score = self.attn_mch([query, state]) - return score - - vocab = 20 - embedding_dim = 6 - num_batches = 5 - - model = MyModel(vocab, embedding_dim, self.memory_size, self.units) - if tf.executing_eagerly(): - model.compile("rmsprop", "mse", run_eagerly=True) - else: - model.compile("rmsprop", "mse") - - x = np.random.randint(vocab, size=(num_batches * self.batch, self.timestep)) - x_test = np.random.randint( - vocab, size=(num_batches * self.batch, self.timestep) - ) - y = np.random.randn(num_batches * self.batch, self.timestep) + y = np.random.randn(num_batches * dummy_data.batch, dummy_data.timestep) - query = np.tile(self.query, [num_batches, 1]) - state = np.tile(self.state, [num_batches, 1]) + query = np.tile(dummy_data.query, [num_batches, 1]) + state = np.tile(dummy_data.state, [num_batches, 1]) - model.fit([x, query, state], (y, y), batch_size=self.batch) - model.predict_on_batch([x_test, query, state]) + model.fit([x, query, state], (y, y), batch_size=dummy_data.batch) + model.predict_on_batch([x_test, query, state]) class ResultSummary( From 2c8153f8903908ee81b2df3ea10903442ae9add4 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Fri, 17 Apr 2020 10:45:14 +0000 Subject: [PATCH 02/40] Refactoring. --- .../seq2seq/tests/attention_wrapper_test.py | 64 +++++-------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index c66e2ed3fe..953c6f8818 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -44,14 +44,16 @@ def __init__(self): self.state = np.random.randn(self.batch, self.timestep).astype(np.float32) +attention_classes = [ + wrapper.LuongAttention, + wrapper.LuongMonotonicAttention, + wrapper.BahdanauAttention, + wrapper.BahdanauMonotonicAttention, +] + + @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_attention_shape_inference(attention_cls): dummy_data = DummyData() @@ -63,13 +65,7 @@ def test_attention_shape_inference(attention_cls): @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_get_config(attention_cls): dummy_data = DummyData() @@ -83,13 +79,7 @@ def test_get_config(attention_cls): @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_layer_output(attention_cls): dummy_data = DummyData() @@ -102,13 +92,7 @@ def test_layer_output(attention_cls): @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_passing_memory_from_call(attention_cls): dummy_data = DummyData() @@ -131,13 +115,7 @@ def test_passing_memory_from_call(attention_cls): @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - 
wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_save_load_layer(attention_cls): dummy_data = DummyData() @@ -181,13 +159,7 @@ def test_save_load_layer(attention_cls): @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_manual_memory_reset(attention_cls): dummy_data = DummyData() @@ -226,13 +198,7 @@ def test_masking(): @pytest.mark.parametrize( - "attention_cls", - [ - wrapper.LuongAttention, - wrapper.LuongMonotonicAttention, - wrapper.BahdanauAttention, - wrapper.BahdanauMonotonicAttention, - ], + "attention_cls", attention_classes, ) def test_memory_re_setup(attention_cls): class MyModel(tf.keras.models.Model): From c494cb861e8bfa01bee239f51dc5675c2e7a074b Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Fri, 17 Apr 2020 10:47:32 +0000 Subject: [PATCH 03/40] Some black bug. --- .../seq2seq/tests/attention_wrapper_test.py | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 953c6f8818..54cf5bbbfc 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -52,9 +52,7 @@ def __init__(self): ] -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_attention_shape_inference(attention_cls): dummy_data = DummyData() attention = attention_cls(dummy_data.units, dummy_data.memory) @@ -64,9 +62,7 @@ def test_attention_shape_inference(attention_cls): assert attention_score[1].shape == (dummy_data.batch, dummy_data.timestep) -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_get_config(attention_cls): dummy_data = DummyData() attention = attention_cls(dummy_data.units, dummy_data.memory) @@ -78,9 +74,7 @@ def test_get_config(attention_cls): assert config == config_from_clone -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_layer_output(attention_cls): dummy_data = DummyData() attention = attention_cls(dummy_data.units, dummy_data.memory) @@ -91,9 +85,7 @@ def test_layer_output(attention_cls): assert score[1].shape == (dummy_data.batch, dummy_data.timestep) -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_passing_memory_from_call(attention_cls): dummy_data = DummyData() attention = attention_cls(dummy_data.units, dummy_data.memory) @@ -114,9 +106,7 @@ def test_passing_memory_from_call(attention_cls): np.testing.assert_allclose(ref_score, score) -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_save_load_layer(attention_cls): dummy_data = DummyData() vocab = 20 @@ -158,9 +148,7 @@ def test_save_load_layer(attention_cls): np.testing.assert_allclose(y_ref, y) -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_manual_memory_reset(attention_cls): dummy_data = 
DummyData() attention = attention_cls(dummy_data.units) @@ -197,9 +185,7 @@ def test_masking(): assert np.sum(np.triu(alignment, k=1)) == 0 -@pytest.mark.parametrize( - "attention_cls", attention_classes, -) +@pytest.mark.parametrize("attention_cls", attention_classes) def test_memory_re_setup(attention_cls): class MyModel(tf.keras.models.Model): def __init__(self, vocab, embedding_dim, memory_size, units): From d20aff90ad16032a7ff97a264ca6570ba11e3725 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 08:36:34 +0000 Subject: [PATCH 04/40] Done one test. --- .../seq2seq/tests/attention_wrapper_test.py | 82 +++++++++++++------ 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 7793f0d304..76f2ee5809 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -244,6 +244,63 @@ def get_result_summary(x): return x +def assert_allclose_or_equal(x, y, **kwargs): + if isinstance(x, np.ndarray) or isinstance(x, float): + np.testing.assert_allclose(x, y, atol=1e-3, **kwargs) + else: + assert x == y + + +class DummyData2: + def __init__(self): + self.batch = 64 + self.units = 128 + self.encoder_timestep = 10 + self.encoder_dim = 256 + self.decoder_timestep = 12 + self.encoder_outputs = np.random.randn( + self.batch, self.encoder_timestep, self.encoder_dim + ) + self.encoder_sequence_length = np.random.randint( + 1, high=self.encoder_timestep, size=(self.batch,) + ).astype(np.int32) + self.decoder_inputs = np.random.randn( + self.batch, self.decoder_timestep, self.units + ) + self.decoder_sequence_length = np.random.randint( + self.decoder_timestep, size=(self.batch,) + ).astype(np.int32) + + +def test_custom_attention_layer(): + dummy_data = DummyData2() + attention_mechanism = wrapper.LuongAttention(dummy_data.units) + cell = tf.keras.layers.LSTMCell(dummy_data.units) + attention_layer = tf.keras.layers.Dense( + dummy_data.units * 2, use_bias=False, activation=tf.math.tanh + ) + attention_wrapper = wrapper.AttentionWrapper( + cell, attention_mechanism, attention_layer=attention_layer + ) + with pytest.raises(ValueError): + # Should fail because the attention mechanism has not been + # initialized. 
+ attention_wrapper.get_initial_state( + batch_size=dummy_data.batch, dtype=tf.float32 + ) + attention_mechanism.setup_memory( + dummy_data.encoder_outputs.astype(np.float32), + memory_sequence_length=dummy_data.encoder_sequence_length, + ) + initial_state = attention_wrapper.get_initial_state( + batch_size=dummy_data.batch, dtype=tf.float32 + ) + assert initial_state.attention.shape[-1] == dummy_data.units * 2 + first_input = dummy_data.decoder_inputs[:, 0].astype(np.float32) + output, _ = attention_wrapper(first_input, initial_state) + assert output.shape[-1] == dummy_data.units * 2 + + @test_utils.run_all_in_graph_and_eager_modes class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def assertAllCloseOrEqual(self, x, y, **kwargs): @@ -272,31 +329,6 @@ def setUp(self): self.decoder_timestep, size=(self.batch,) ).astype(np.int32) - def testCustomAttentionLayer(self): - attention_mechanism = wrapper.LuongAttention(self.units) - cell = tf.keras.layers.LSTMCell(self.units) - attention_layer = tf.keras.layers.Dense( - self.units * 2, use_bias=False, activation=tf.math.tanh - ) - attention_wrapper = wrapper.AttentionWrapper( - cell, attention_mechanism, attention_layer=attention_layer - ) - with self.assertRaises(ValueError): - # Should fail because the attention mechanism has not been - # initialized. - attention_wrapper.get_initial_state(batch_size=self.batch, dtype=tf.float32) - attention_mechanism.setup_memory( - self.encoder_outputs.astype(np.float32), - memory_sequence_length=self.encoder_sequence_length, - ) - initial_state = attention_wrapper.get_initial_state( - batch_size=self.batch, dtype=tf.float32 - ) - self.assertEqual(initial_state.attention.shape[-1], self.units * 2) - first_input = self.decoder_inputs[:, 0].astype(np.float32) - output, _ = attention_wrapper(first_input, initial_state) - self.assertEqual(output.shape[-1], self.units * 2) - def _testWithAttention( self, create_attention_mechanism, From 9f62a5b3170a5140c222d26c612284eba889e1f5 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 08:41:21 +0000 Subject: [PATCH 05/40] Fuse functions. 
--- .../seq2seq/tests/attention_wrapper_test.py | 33 ++----------------- 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 76f2ee5809..f98671b165 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -347,36 +347,9 @@ def _testWithAttention( [attention_layer_size] if attention_layer_size is not None else None ) attention_layers = [attention_layer] if attention_layer is not None else None - self._testWithMaybeMultiAttention( - is_multi=False, - create_attention_mechanisms=[create_attention_mechanism], - expected_final_output=expected_final_output, - expected_final_state=expected_final_state, - attention_mechanism_depths=[attention_mechanism_depth], - alignment_history=alignment_history, - expected_final_alignment_history=expected_final_alignment_history, - attention_layer_sizes=attention_layer_sizes, - attention_layers=attention_layers, - create_query_layer=create_query_layer, - create_memory_layer=create_memory_layer, - create_attention_kwargs=create_attention_kwargs, - ) - - def _testWithMaybeMultiAttention( - self, - is_multi, - create_attention_mechanisms, - expected_final_output, - expected_final_state, - attention_mechanism_depths, - alignment_history=False, - expected_final_alignment_history=None, - attention_layer_sizes=None, - attention_layers=None, - create_query_layer=False, - create_memory_layer=True, - create_attention_kwargs=None, - ): + create_attention_mechanisms = [create_attention_mechanism] + attention_mechanism_depths = [attention_mechanism_depth] + is_multi = False # Allow is_multi to be True with a single mechanism to enable test for # passing in a single mechanism in a list. assert len(create_attention_mechanisms) == 1 or is_multi From 72fb086ed80197e6ce57e22f3a08ade0a32c6833 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 08:45:13 +0000 Subject: [PATCH 06/40] Rewrite with unittest2pytest. 
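For illustration only, a minimal self-contained sketch of the kind of mechanical rewrite unittest2pytest applies and which the hunks below perform on the real assertions; the test name and values here are hypothetical, not taken from the patch:

import unittest


class ShapeTest(unittest.TestCase):
    # unittest style: assertions are TestCase methods.
    def test_output_shape(self):
        output_shape = (5, None, 6)
        self.assertIsInstance(output_shape, tuple)
        self.assertEqual((5, None, 6), output_shape)


# Equivalent pytest style after the rewrite: a bare function with plain
# asserts, so no TestCase machinery is needed.
def test_output_shape():
    output_shape = (5, None, 6)
    assert isinstance(output_shape, tuple)
    assert (5, None, 6) == output_shape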
--- .../seq2seq/tests/attention_wrapper_test.py | 51 ++++++++----------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index f98671b165..f0abfd65d5 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -452,32 +452,27 @@ def _testWithAttention( sequence_length=decoder_sequence_length, ) - self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput) - self.assertIsInstance(final_state, wrapper.AttentionWrapperState) + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert isinstance(final_state, wrapper.AttentionWrapperState) expected_time = ( max(decoder_sequence_length) if tf.executing_eagerly() else None ) - self.assertEqual( - (batch_size, expected_time, attention_depth), - tuple(final_outputs.rnn_output.get_shape().as_list()), + assert (batch_size, expected_time, attention_depth) == tuple( + final_outputs.rnn_output.get_shape().as_list() ) - self.assertEqual( - (batch_size, expected_time), - tuple(final_outputs.sample_id.get_shape().as_list()), + assert (batch_size, expected_time) == tuple( + final_outputs.sample_id.get_shape().as_list() ) - self.assertEqual( - (batch_size, attention_depth), - tuple(final_state.attention.get_shape().as_list()), + assert (batch_size, attention_depth) == tuple( + final_state.attention.get_shape().as_list() ) - self.assertEqual( - (batch_size, cell_depth), - tuple(final_state.cell_state[0].get_shape().as_list()), + assert (batch_size, cell_depth) == tuple( + final_state.cell_state[0].get_shape().as_list() ) - self.assertEqual( - (batch_size, cell_depth), - tuple(final_state.cell_state[1].get_shape().as_list()), + assert (batch_size, cell_depth) == tuple( + final_state.cell_state[1].get_shape().as_list() ) if alignment_history: @@ -485,17 +480,15 @@ def _testWithAttention( state_alignment_history = [] for history_array in final_state.alignment_history: history = history_array.stack() - self.assertEqual( - (expected_time, batch_size, encoder_max_time), - tuple(history.get_shape().as_list()), + assert (expected_time, batch_size, encoder_max_time) == tuple( + history.get_shape().as_list() ) state_alignment_history.append(history) state_alignment_history = tuple(state_alignment_history) else: state_alignment_history = final_state.alignment_history.stack() - self.assertEqual( - (expected_time, batch_size, encoder_max_time), - tuple(state_alignment_history.get_shape().as_list()), + assert (expected_time, batch_size, encoder_max_time) == tuple( + state_alignment_history.get_shape().as_list() ) tf.nest.assert_same_structure( cell.state_size, @@ -571,9 +564,9 @@ def testBahdanauNormalizedDType(self, dtype): initial_state=cell.get_initial_state(batch_size=self.batch, dtype=dtype), sequence_length=self.decoder_sequence_length, ) - self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput) - self.assertEqual(final_outputs.rnn_output.dtype, dtype) - self.assertIsInstance(final_state, wrapper.AttentionWrapperState) + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert final_outputs.rnn_output.dtype == dtype + assert isinstance(final_state, wrapper.AttentionWrapperState) @parameterized.parameters([np.float32, np.float64]) def testLuongScaledDType(self, dtype): @@ -600,9 +593,9 @@ def testLuongScaledDType(self, dtype): initial_state=cell.get_initial_state(batch_size=self.batch, dtype=dtype), 
sequence_length=self.decoder_sequence_length, ) - self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput) - self.assertEqual(final_outputs.rnn_output.dtype, dtype) - self.assertIsInstance(final_state, wrapper.AttentionWrapperState) + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert final_outputs.rnn_output.dtype == dtype + assert isinstance(final_state, wrapper.AttentionWrapperState) def testBahdanauNotNormalized(self): create_attention_mechanism = wrapper.BahdanauAttention From db01ae078136f250cfd94818c22f7a15f874c789 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 08:52:11 +0000 Subject: [PATCH 07/40] Moved function. --- .../seq2seq/tests/attention_wrapper_test.py | 444 +++++++++--------- 1 file changed, 224 insertions(+), 220 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index f0abfd65d5..d77a7b9c7a 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -301,6 +301,212 @@ def test_custom_attention_layer(): assert output.shape[-1] == dummy_data.units * 2 +def _test_with_attention( + self, + create_attention_mechanism, + expected_final_output, + expected_final_state, + attention_mechanism_depth=3, + alignment_history=False, + expected_final_alignment_history=None, + attention_layer_size=6, + attention_layer=None, + create_query_layer=False, + create_memory_layer=True, + create_attention_kwargs=None, +): + attention_layer_sizes = ( + [attention_layer_size] if attention_layer_size is not None else None + ) + attention_layers = [attention_layer] if attention_layer is not None else None + create_attention_mechanisms = [create_attention_mechanism] + attention_mechanism_depths = [attention_mechanism_depth] + is_multi = False + # Allow is_multi to be True with a single mechanism to enable test for + # passing in a single mechanism in a list. + assert len(create_attention_mechanisms) == 1 or is_multi + encoder_sequence_length = [3, 2, 3, 1, 1] + decoder_sequence_length = [2, 0, 1, 2, 3] + batch_size = 5 + encoder_max_time = 8 + decoder_max_time = 4 + input_depth = 7 + encoder_output_depth = 10 + cell_depth = 9 + create_attention_kwargs = create_attention_kwargs or {} + + if attention_layer_sizes is not None: + # Compute sum of attention_layer_sizes. Use encoder_output_depth if + # None. + attention_depth = sum( + attention_layer_size or encoder_output_depth + for attention_layer_size in attention_layer_sizes + ) + elif attention_layers is not None: + # Compute sum of attention_layers output depth. + attention_depth = sum( + attention_layer.compute_output_shape( + [batch_size, cell_depth + encoder_output_depth] + ) + .dims[-1] + .value + for attention_layer in attention_layers + ) + else: + attention_depth = encoder_output_depth * len(create_attention_mechanisms) + + decoder_inputs = np.random.randn(batch_size, decoder_max_time, input_depth).astype( + np.float32 + ) + encoder_outputs = np.random.randn( + batch_size, encoder_max_time, encoder_output_depth + ).astype(np.float32) + + attention_mechanisms = [] + for creator, depth in zip(create_attention_mechanisms, attention_mechanism_depths): + # Create a memory layer with deterministic initializer to avoid + # randomness in the test between graph and eager. 
+ if create_query_layer: + create_attention_kwargs["query_layer"] = tf.keras.layers.Dense( + depth, kernel_initializer="ones", use_bias=False + ) + if create_memory_layer: + create_attention_kwargs["memory_layer"] = tf.keras.layers.Dense( + depth, kernel_initializer="ones", use_bias=False + ) + + attention_mechanisms.append( + creator( + units=depth, + memory=encoder_outputs, + memory_sequence_length=encoder_sequence_length, + **create_attention_kwargs, + ) + ) + + with self.cached_session(use_gpu=True): + attention_layer_size = attention_layer_sizes + attention_layer = attention_layers + if not is_multi: + if attention_layer_size is not None: + attention_layer_size = attention_layer_size[0] + if attention_layer is not None: + attention_layer = attention_layer[0] + cell = tf.keras.layers.LSTMCell( + cell_depth, + recurrent_activation="sigmoid", + kernel_initializer="ones", + recurrent_initializer="ones", + ) + cell = wrapper.AttentionWrapper( + cell, + attention_mechanisms if is_multi else attention_mechanisms[0], + attention_layer_size=attention_layer_size, + alignment_history=alignment_history, + attention_layer=attention_layer, + ) + if cell._attention_layers is not None: + for layer in cell._attention_layers: + layer.kernel_initializer = tf.compat.v1.keras.initializers.glorot_uniform( + seed=1337 + ) + + sampler = sampler_py.TrainingSampler() + my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler) + initial_state = cell.get_initial_state(dtype=tf.float32, batch_size=batch_size) + final_outputs, final_state, _ = my_decoder( + decoder_inputs, + initial_state=initial_state, + sequence_length=decoder_sequence_length, + ) + + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert isinstance(final_state, wrapper.AttentionWrapperState) + + expected_time = max(decoder_sequence_length) if tf.executing_eagerly() else None + assert (batch_size, expected_time, attention_depth) == tuple( + final_outputs.rnn_output.get_shape().as_list() + ) + assert (batch_size, expected_time) == tuple( + final_outputs.sample_id.get_shape().as_list() + ) + + assert (batch_size, attention_depth) == tuple( + final_state.attention.get_shape().as_list() + ) + assert (batch_size, cell_depth) == tuple( + final_state.cell_state[0].get_shape().as_list() + ) + assert (batch_size, cell_depth) == tuple( + final_state.cell_state[1].get_shape().as_list() + ) + + if alignment_history: + if is_multi: + state_alignment_history = [] + for history_array in final_state.alignment_history: + history = history_array.stack() + assert (expected_time, batch_size, encoder_max_time) == tuple( + history.get_shape().as_list() + ) + state_alignment_history.append(history) + state_alignment_history = tuple(state_alignment_history) + else: + state_alignment_history = final_state.alignment_history.stack() + assert (expected_time, batch_size, encoder_max_time) == tuple( + state_alignment_history.get_shape().as_list() + ) + tf.nest.assert_same_structure( + cell.state_size, + cell.get_initial_state(batch_size=batch_size, dtype=tf.float32), + ) + # Remove the history from final_state for purposes of the + # remainder of the tests. 
+ final_state = final_state._replace( + alignment_history=() + ) # pylint: disable=protected-access + else: + state_alignment_history = () + + self.evaluate(tf.compat.v1.global_variables_initializer()) + eval_result = self.evaluate( + { + "final_outputs": final_outputs, + "final_state": final_state, + "state_alignment_history": state_alignment_history, + } + ) + + final_output_info = tf.nest.map_structure( + get_result_summary, eval_result["final_outputs"] + ) + final_state_info = tf.nest.map_structure( + get_result_summary, eval_result["final_state"] + ) + print("final_output_info: ", final_output_info) + print("final_state_info: ", final_state_info) + + tf.nest.map_structure( + self.assertAllCloseOrEqual, expected_final_output, final_output_info + ) + tf.nest.map_structure( + self.assertAllCloseOrEqual, expected_final_state, final_state_info + ) + # by default, the wrapper emits attention as output + if alignment_history: + final_alignment_history_info = tf.nest.map_structure( + get_result_summary, eval_result["state_alignment_history"] + ) + print("final_alignment_history_info: ", final_alignment_history_info) + tf.nest.map_structure( + self.assertAllCloseOrEqual, + # outputs are batch major but the stacked TensorArray is + # time major + expected_final_alignment_history, + final_alignment_history_info, + ) + + @test_utils.run_all_in_graph_and_eager_modes class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def assertAllCloseOrEqual(self, x, y, **kwargs): @@ -329,217 +535,6 @@ def setUp(self): self.decoder_timestep, size=(self.batch,) ).astype(np.int32) - def _testWithAttention( - self, - create_attention_mechanism, - expected_final_output, - expected_final_state, - attention_mechanism_depth=3, - alignment_history=False, - expected_final_alignment_history=None, - attention_layer_size=6, - attention_layer=None, - create_query_layer=False, - create_memory_layer=True, - create_attention_kwargs=None, - ): - attention_layer_sizes = ( - [attention_layer_size] if attention_layer_size is not None else None - ) - attention_layers = [attention_layer] if attention_layer is not None else None - create_attention_mechanisms = [create_attention_mechanism] - attention_mechanism_depths = [attention_mechanism_depth] - is_multi = False - # Allow is_multi to be True with a single mechanism to enable test for - # passing in a single mechanism in a list. - assert len(create_attention_mechanisms) == 1 or is_multi - encoder_sequence_length = [3, 2, 3, 1, 1] - decoder_sequence_length = [2, 0, 1, 2, 3] - batch_size = 5 - encoder_max_time = 8 - decoder_max_time = 4 - input_depth = 7 - encoder_output_depth = 10 - cell_depth = 9 - create_attention_kwargs = create_attention_kwargs or {} - - if attention_layer_sizes is not None: - # Compute sum of attention_layer_sizes. Use encoder_output_depth if - # None. - attention_depth = sum( - attention_layer_size or encoder_output_depth - for attention_layer_size in attention_layer_sizes - ) - elif attention_layers is not None: - # Compute sum of attention_layers output depth. 
- attention_depth = sum( - attention_layer.compute_output_shape( - [batch_size, cell_depth + encoder_output_depth] - ) - .dims[-1] - .value - for attention_layer in attention_layers - ) - else: - attention_depth = encoder_output_depth * len(create_attention_mechanisms) - - decoder_inputs = np.random.randn( - batch_size, decoder_max_time, input_depth - ).astype(np.float32) - encoder_outputs = np.random.randn( - batch_size, encoder_max_time, encoder_output_depth - ).astype(np.float32) - - attention_mechanisms = [] - for creator, depth in zip( - create_attention_mechanisms, attention_mechanism_depths - ): - # Create a memory layer with deterministic initializer to avoid - # randomness in the test between graph and eager. - if create_query_layer: - create_attention_kwargs["query_layer"] = tf.keras.layers.Dense( - depth, kernel_initializer="ones", use_bias=False - ) - if create_memory_layer: - create_attention_kwargs["memory_layer"] = tf.keras.layers.Dense( - depth, kernel_initializer="ones", use_bias=False - ) - - attention_mechanisms.append( - creator( - units=depth, - memory=encoder_outputs, - memory_sequence_length=encoder_sequence_length, - **create_attention_kwargs, - ) - ) - - with self.cached_session(use_gpu=True): - attention_layer_size = attention_layer_sizes - attention_layer = attention_layers - if not is_multi: - if attention_layer_size is not None: - attention_layer_size = attention_layer_size[0] - if attention_layer is not None: - attention_layer = attention_layer[0] - cell = tf.keras.layers.LSTMCell( - cell_depth, - recurrent_activation="sigmoid", - kernel_initializer="ones", - recurrent_initializer="ones", - ) - cell = wrapper.AttentionWrapper( - cell, - attention_mechanisms if is_multi else attention_mechanisms[0], - attention_layer_size=attention_layer_size, - alignment_history=alignment_history, - attention_layer=attention_layer, - ) - if cell._attention_layers is not None: - for layer in cell._attention_layers: - layer.kernel_initializer = tf.compat.v1.keras.initializers.glorot_uniform( - seed=1337 - ) - - sampler = sampler_py.TrainingSampler() - my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler) - initial_state = cell.get_initial_state( - dtype=tf.float32, batch_size=batch_size - ) - final_outputs, final_state, _ = my_decoder( - decoder_inputs, - initial_state=initial_state, - sequence_length=decoder_sequence_length, - ) - - assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) - assert isinstance(final_state, wrapper.AttentionWrapperState) - - expected_time = ( - max(decoder_sequence_length) if tf.executing_eagerly() else None - ) - assert (batch_size, expected_time, attention_depth) == tuple( - final_outputs.rnn_output.get_shape().as_list() - ) - assert (batch_size, expected_time) == tuple( - final_outputs.sample_id.get_shape().as_list() - ) - - assert (batch_size, attention_depth) == tuple( - final_state.attention.get_shape().as_list() - ) - assert (batch_size, cell_depth) == tuple( - final_state.cell_state[0].get_shape().as_list() - ) - assert (batch_size, cell_depth) == tuple( - final_state.cell_state[1].get_shape().as_list() - ) - - if alignment_history: - if is_multi: - state_alignment_history = [] - for history_array in final_state.alignment_history: - history = history_array.stack() - assert (expected_time, batch_size, encoder_max_time) == tuple( - history.get_shape().as_list() - ) - state_alignment_history.append(history) - state_alignment_history = tuple(state_alignment_history) - else: - state_alignment_history = 
final_state.alignment_history.stack() - assert (expected_time, batch_size, encoder_max_time) == tuple( - state_alignment_history.get_shape().as_list() - ) - tf.nest.assert_same_structure( - cell.state_size, - cell.get_initial_state(batch_size=batch_size, dtype=tf.float32), - ) - # Remove the history from final_state for purposes of the - # remainder of the tests. - final_state = final_state._replace( - alignment_history=() - ) # pylint: disable=protected-access - else: - state_alignment_history = () - - self.evaluate(tf.compat.v1.global_variables_initializer()) - eval_result = self.evaluate( - { - "final_outputs": final_outputs, - "final_state": final_state, - "state_alignment_history": state_alignment_history, - } - ) - - final_output_info = tf.nest.map_structure( - get_result_summary, eval_result["final_outputs"] - ) - final_state_info = tf.nest.map_structure( - get_result_summary, eval_result["final_state"] - ) - print("final_output_info: ", final_output_info) - print("final_state_info: ", final_state_info) - - tf.nest.map_structure( - self.assertAllCloseOrEqual, expected_final_output, final_output_info - ) - tf.nest.map_structure( - self.assertAllCloseOrEqual, expected_final_state, final_state_info - ) - # by default, the wrapper emits attention as output - if alignment_history: - final_alignment_history_info = tf.nest.map_structure( - get_result_summary, eval_result["state_alignment_history"] - ) - print("final_alignment_history_info: ", final_alignment_history_info) - tf.nest.map_structure( - self.assertAllCloseOrEqual, - # outputs are batch major but the stacked TensorArray is - # time major - expected_final_alignment_history, - final_alignment_history_info, - ) - @parameterized.parameters([np.float32, np.float64]) def testBahdanauNormalizedDType(self, dtype): encoder_outputs = self.encoder_outputs.astype(dtype) @@ -630,7 +625,8 @@ def testBahdanauNotNormalized(self): shape=(3, 5, 8), dtype=np.dtype(np.float32), mean=0.125 ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -667,7 +663,8 @@ def testBahdanauNormalized(self): alignment_history=(), ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -703,7 +700,8 @@ def testLuongNotNormalized(self): alignment_history=(), ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -739,7 +737,8 @@ def testLuongScaled(self): alignment_history=(), ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -774,7 +773,8 @@ def testNotUseAttentionLayer(self): alignment_history=(), ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -815,7 +815,8 @@ def testBahdanauMonotonicNotNormalized(self): shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.10261579603 ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -856,7 +857,8 @@ def testBahdanauMonotonicNormalized(self): shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.07909643 ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -897,7 +899,8 @@ def testLuongMonotonicNotNormalized(self): shape=(3, 5, 8), 
dtype=np.dtype("float32"), mean=0.06994973868 ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -938,7 +941,8 @@ def testLuongMonotonicScaled(self): shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.06994973868 ) - self._testWithAttention( + _test_with_attention( + self, create_attention_mechanism, expected_final_output, expected_final_state, From 0ff4ea2c35d9ad946150ed59246b68720f1d9b4f Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 08:55:35 +0000 Subject: [PATCH 08/40] Removed use gpu. --- .../seq2seq/tests/attention_wrapper_test.py | 215 +++++++++--------- 1 file changed, 107 insertions(+), 108 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index d77a7b9c7a..4135ecb60d 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -384,127 +384,126 @@ def _test_with_attention( ) ) - with self.cached_session(use_gpu=True): - attention_layer_size = attention_layer_sizes - attention_layer = attention_layers - if not is_multi: - if attention_layer_size is not None: - attention_layer_size = attention_layer_size[0] - if attention_layer is not None: - attention_layer = attention_layer[0] - cell = tf.keras.layers.LSTMCell( - cell_depth, - recurrent_activation="sigmoid", - kernel_initializer="ones", - recurrent_initializer="ones", - ) - cell = wrapper.AttentionWrapper( - cell, - attention_mechanisms if is_multi else attention_mechanisms[0], - attention_layer_size=attention_layer_size, - alignment_history=alignment_history, - attention_layer=attention_layer, - ) - if cell._attention_layers is not None: - for layer in cell._attention_layers: - layer.kernel_initializer = tf.compat.v1.keras.initializers.glorot_uniform( - seed=1337 - ) + attention_layer_size = attention_layer_sizes + attention_layer = attention_layers + if not is_multi: + if attention_layer_size is not None: + attention_layer_size = attention_layer_size[0] + if attention_layer is not None: + attention_layer = attention_layer[0] + cell = tf.keras.layers.LSTMCell( + cell_depth, + recurrent_activation="sigmoid", + kernel_initializer="ones", + recurrent_initializer="ones", + ) + cell = wrapper.AttentionWrapper( + cell, + attention_mechanisms if is_multi else attention_mechanisms[0], + attention_layer_size=attention_layer_size, + alignment_history=alignment_history, + attention_layer=attention_layer, + ) + if cell._attention_layers is not None: + for layer in cell._attention_layers: + layer.kernel_initializer = tf.compat.v1.keras.initializers.glorot_uniform( + seed=1337 + ) - sampler = sampler_py.TrainingSampler() - my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler) - initial_state = cell.get_initial_state(dtype=tf.float32, batch_size=batch_size) - final_outputs, final_state, _ = my_decoder( - decoder_inputs, - initial_state=initial_state, - sequence_length=decoder_sequence_length, - ) + sampler = sampler_py.TrainingSampler() + my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler) + initial_state = cell.get_initial_state(dtype=tf.float32, batch_size=batch_size) + final_outputs, final_state, _ = my_decoder( + decoder_inputs, + initial_state=initial_state, + sequence_length=decoder_sequence_length, + ) - assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) - assert isinstance(final_state, 
wrapper.AttentionWrapperState) + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert isinstance(final_state, wrapper.AttentionWrapperState) - expected_time = max(decoder_sequence_length) if tf.executing_eagerly() else None - assert (batch_size, expected_time, attention_depth) == tuple( - final_outputs.rnn_output.get_shape().as_list() - ) - assert (batch_size, expected_time) == tuple( - final_outputs.sample_id.get_shape().as_list() - ) + expected_time = max(decoder_sequence_length) if tf.executing_eagerly() else None + assert (batch_size, expected_time, attention_depth) == tuple( + final_outputs.rnn_output.get_shape().as_list() + ) + assert (batch_size, expected_time) == tuple( + final_outputs.sample_id.get_shape().as_list() + ) - assert (batch_size, attention_depth) == tuple( - final_state.attention.get_shape().as_list() - ) - assert (batch_size, cell_depth) == tuple( - final_state.cell_state[0].get_shape().as_list() - ) - assert (batch_size, cell_depth) == tuple( - final_state.cell_state[1].get_shape().as_list() - ) + assert (batch_size, attention_depth) == tuple( + final_state.attention.get_shape().as_list() + ) + assert (batch_size, cell_depth) == tuple( + final_state.cell_state[0].get_shape().as_list() + ) + assert (batch_size, cell_depth) == tuple( + final_state.cell_state[1].get_shape().as_list() + ) - if alignment_history: - if is_multi: - state_alignment_history = [] - for history_array in final_state.alignment_history: - history = history_array.stack() - assert (expected_time, batch_size, encoder_max_time) == tuple( - history.get_shape().as_list() - ) - state_alignment_history.append(history) - state_alignment_history = tuple(state_alignment_history) - else: - state_alignment_history = final_state.alignment_history.stack() + if alignment_history: + if is_multi: + state_alignment_history = [] + for history_array in final_state.alignment_history: + history = history_array.stack() assert (expected_time, batch_size, encoder_max_time) == tuple( - state_alignment_history.get_shape().as_list() + history.get_shape().as_list() ) - tf.nest.assert_same_structure( - cell.state_size, - cell.get_initial_state(batch_size=batch_size, dtype=tf.float32), - ) - # Remove the history from final_state for purposes of the - # remainder of the tests. - final_state = final_state._replace( - alignment_history=() - ) # pylint: disable=protected-access + state_alignment_history.append(history) + state_alignment_history = tuple(state_alignment_history) else: - state_alignment_history = () - - self.evaluate(tf.compat.v1.global_variables_initializer()) - eval_result = self.evaluate( - { - "final_outputs": final_outputs, - "final_state": final_state, - "state_alignment_history": state_alignment_history, - } - ) + state_alignment_history = final_state.alignment_history.stack() + assert (expected_time, batch_size, encoder_max_time) == tuple( + state_alignment_history.get_shape().as_list() + ) + tf.nest.assert_same_structure( + cell.state_size, + cell.get_initial_state(batch_size=batch_size, dtype=tf.float32), + ) + # Remove the history from final_state for purposes of the + # remainder of the tests. 
+ final_state = final_state._replace( + alignment_history=() + ) # pylint: disable=protected-access + else: + state_alignment_history = () + + self.evaluate(tf.compat.v1.global_variables_initializer()) + eval_result = self.evaluate( + { + "final_outputs": final_outputs, + "final_state": final_state, + "state_alignment_history": state_alignment_history, + } + ) - final_output_info = tf.nest.map_structure( - get_result_summary, eval_result["final_outputs"] - ) - final_state_info = tf.nest.map_structure( - get_result_summary, eval_result["final_state"] - ) - print("final_output_info: ", final_output_info) - print("final_state_info: ", final_state_info) + final_output_info = tf.nest.map_structure( + get_result_summary, eval_result["final_outputs"] + ) + final_state_info = tf.nest.map_structure( + get_result_summary, eval_result["final_state"] + ) + print("final_output_info: ", final_output_info) + print("final_state_info: ", final_state_info) - tf.nest.map_structure( - self.assertAllCloseOrEqual, expected_final_output, final_output_info + tf.nest.map_structure( + self.assertAllCloseOrEqual, expected_final_output, final_output_info + ) + tf.nest.map_structure( + self.assertAllCloseOrEqual, expected_final_state, final_state_info + ) + # by default, the wrapper emits attention as output + if alignment_history: + final_alignment_history_info = tf.nest.map_structure( + get_result_summary, eval_result["state_alignment_history"] ) + print("final_alignment_history_info: ", final_alignment_history_info) tf.nest.map_structure( - self.assertAllCloseOrEqual, expected_final_state, final_state_info + self.assertAllCloseOrEqual, + # outputs are batch major but the stacked TensorArray is + # time major + expected_final_alignment_history, + final_alignment_history_info, ) - # by default, the wrapper emits attention as output - if alignment_history: - final_alignment_history_info = tf.nest.map_structure( - get_result_summary, eval_result["state_alignment_history"] - ) - print("final_alignment_history_info: ", final_alignment_history_info) - tf.nest.map_structure( - self.assertAllCloseOrEqual, - # outputs are batch major but the stacked TensorArray is - # time major - expected_final_alignment_history, - final_alignment_history_info, - ) @test_utils.run_all_in_graph_and_eager_modes From 2c6033e00497896369402d963579bab5daad96e4 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:02:29 +0000 Subject: [PATCH 09/40] Remove prints. 
--- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 4135ecb60d..326ce08c4e 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -482,8 +482,6 @@ def _test_with_attention( final_state_info = tf.nest.map_structure( get_result_summary, eval_result["final_state"] ) - print("final_output_info: ", final_output_info) - print("final_state_info: ", final_state_info) tf.nest.map_structure( self.assertAllCloseOrEqual, expected_final_output, final_output_info @@ -496,7 +494,6 @@ def _test_with_attention( final_alignment_history_info = tf.nest.map_structure( get_result_summary, eval_result["state_alignment_history"] ) - print("final_alignment_history_info: ", final_alignment_history_info) tf.nest.map_structure( self.assertAllCloseOrEqual, # outputs are batch major but the stacked TensorArray is From 6eeb7b7bd996f513e88f0851ac635a5ff71fcba4 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:04:22 +0000 Subject: [PATCH 10/40] Removed one assertAllclose. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 326ce08c4e..22bc7a439b 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -484,7 +484,7 @@ def _test_with_attention( ) tf.nest.map_structure( - self.assertAllCloseOrEqual, expected_final_output, final_output_info + assert_allclose_or_equal, expected_final_output, final_output_info ) tf.nest.map_structure( self.assertAllCloseOrEqual, expected_final_state, final_state_info From 54c61a9da240b53f2325eb3c2bf9b7d20f664150 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:05:30 +0000 Subject: [PATCH 11/40] Removed some more self. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 22bc7a439b..daca5e3aff 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -487,7 +487,7 @@ def _test_with_attention( assert_allclose_or_equal, expected_final_output, final_output_info ) tf.nest.map_structure( - self.assertAllCloseOrEqual, expected_final_state, final_state_info + assert_allclose_or_equal, expected_final_state, final_state_info ) # by default, the wrapper emits attention as output if alignment_history: @@ -495,7 +495,7 @@ def _test_with_attention( get_result_summary, eval_result["state_alignment_history"] ) tf.nest.map_structure( - self.assertAllCloseOrEqual, + assert_allclose_or_equal, # outputs are batch major but the stacked TensorArray is # time major expected_final_alignment_history, From 7c9d153ccf83645097e534b2f0185c7bb3a83a17 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:07:26 +0000 Subject: [PATCH 12/40] Removed some more self. 
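The hunks below rebuild this test's inputs from a DummyData2 container instead of the setUp attributes. DummyData2 itself is defined outside these hunks; judging from the fields used (batch, units, encoder/decoder shapes and lengths, with the same sizes the old setUp used), it is presumably a plain data holder along these lines (a sketch, not the exact code):

    import numpy as np

    class DummyData2:
        # Assumed mirror of the former setUp fields for the wrapper tests.
        def __init__(self):
            self.batch = 64
            self.units = 128
            self.encoder_timestep = 10
            self.encoder_dim = 256
            self.decoder_timestep = 12
            self.encoder_outputs = np.random.randn(
                self.batch, self.encoder_timestep, self.encoder_dim
            )
            self.encoder_sequence_length = np.random.randint(
                1, high=self.encoder_timestep, size=(self.batch,)
            ).astype(np.int32)
            self.decoder_inputs = np.random.randn(
                self.batch, self.decoder_timestep, self.units
            )
            self.decoder_sequence_length = np.random.randint(
                self.decoder_timestep, size=(self.batch,)
            ).astype(np.int32)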
--- .../seq2seq/tests/attention_wrapper_test.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index daca5e3aff..9c812ecbef 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -533,17 +533,18 @@ def setUp(self): @parameterized.parameters([np.float32, np.float64]) def testBahdanauNormalizedDType(self, dtype): - encoder_outputs = self.encoder_outputs.astype(dtype) - decoder_inputs = self.decoder_inputs.astype(dtype) + dummy_data = DummyData2() + encoder_outputs = dummy_data.encoder_outputs.astype(dtype) + decoder_inputs = dummy_data.decoder_inputs.astype(dtype) attention_mechanism = wrapper.BahdanauAttention( - units=self.units, + units=dummy_data.units, memory=encoder_outputs, - memory_sequence_length=self.encoder_sequence_length, + memory_sequence_length=dummy_data.encoder_sequence_length, normalize=True, dtype=dtype, ) cell = tf.keras.layers.LSTMCell( - self.units, recurrent_activation="sigmoid", dtype=dtype + dummy_data.units, recurrent_activation="sigmoid", dtype=dtype ) cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) @@ -552,8 +553,10 @@ def testBahdanauNormalizedDType(self, dtype): final_outputs, final_state, _ = my_decoder( decoder_inputs, - initial_state=cell.get_initial_state(batch_size=self.batch, dtype=dtype), - sequence_length=self.decoder_sequence_length, + initial_state=cell.get_initial_state( + batch_size=dummy_data.batch, dtype=dtype + ), + sequence_length=dummy_data.decoder_sequence_length, ) assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) assert final_outputs.rnn_output.dtype == dtype From 0e680de9034e4bf03f3ecf61d7d22b73bc87d5a5 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:09:02 +0000 Subject: [PATCH 13/40] Moved function out of class. 
--- .../seq2seq/tests/attention_wrapper_test.py | 61 +++++++++---------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 9c812ecbef..964ab1ccd5 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -503,6 +503,36 @@ def _test_with_attention( ) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_bahdanau_normalized_dtype(dtype): + dummy_data = DummyData2() + encoder_outputs = dummy_data.encoder_outputs.astype(dtype) + decoder_inputs = dummy_data.decoder_inputs.astype(dtype) + attention_mechanism = wrapper.BahdanauAttention( + units=dummy_data.units, + memory=encoder_outputs, + memory_sequence_length=dummy_data.encoder_sequence_length, + normalize=True, + dtype=dtype, + ) + cell = tf.keras.layers.LSTMCell( + dummy_data.units, recurrent_activation="sigmoid", dtype=dtype + ) + cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) + + sampler = sampler_py.TrainingSampler() + my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler, dtype=dtype) + + final_outputs, final_state, _ = my_decoder( + decoder_inputs, + initial_state=cell.get_initial_state(batch_size=dummy_data.batch, dtype=dtype), + sequence_length=dummy_data.decoder_sequence_length, + ) + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert final_outputs.rnn_output.dtype == dtype + assert isinstance(final_state, wrapper.AttentionWrapperState) + + @test_utils.run_all_in_graph_and_eager_modes class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def assertAllCloseOrEqual(self, x, y, **kwargs): @@ -531,37 +561,6 @@ def setUp(self): self.decoder_timestep, size=(self.batch,) ).astype(np.int32) - @parameterized.parameters([np.float32, np.float64]) - def testBahdanauNormalizedDType(self, dtype): - dummy_data = DummyData2() - encoder_outputs = dummy_data.encoder_outputs.astype(dtype) - decoder_inputs = dummy_data.decoder_inputs.astype(dtype) - attention_mechanism = wrapper.BahdanauAttention( - units=dummy_data.units, - memory=encoder_outputs, - memory_sequence_length=dummy_data.encoder_sequence_length, - normalize=True, - dtype=dtype, - ) - cell = tf.keras.layers.LSTMCell( - dummy_data.units, recurrent_activation="sigmoid", dtype=dtype - ) - cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) - - sampler = sampler_py.TrainingSampler() - my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler, dtype=dtype) - - final_outputs, final_state, _ = my_decoder( - decoder_inputs, - initial_state=cell.get_initial_state( - batch_size=dummy_data.batch, dtype=dtype - ), - sequence_length=dummy_data.decoder_sequence_length, - ) - assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) - assert final_outputs.rnn_output.dtype == dtype - assert isinstance(final_state, wrapper.AttentionWrapperState) - @parameterized.parameters([np.float32, np.float64]) def testLuongScaledDType(self, dtype): # Test case for GitHub issue 18099 From b59e6dbb770ecaba073b116780d1cc19fbe0d344 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:10:39 +0000 Subject: [PATCH 14/40] Removed one function. 
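The assertAllCloseOrEqual method removed below is dead code: earlier patches switched every call site to the module-level assert_allclose_or_equal helper, which is defined outside these hunks. It is presumably the pytest-friendly equivalent of the old method, roughly (a sketch, assuming the same 1e-3 tolerance):

    import numpy as np

    def assert_allclose_or_equal(x, y, **kwargs):
        # Tolerant comparison for floats/arrays, exact equality for everything else.
        if isinstance(x, np.ndarray) or isinstance(x, float):
            np.testing.assert_allclose(x, y, atol=1e-3, **kwargs)
        else:
            assert x == y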
--- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 964ab1ccd5..a07ae2b608 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -535,12 +535,6 @@ def test_bahdanau_normalized_dtype(dtype): @test_utils.run_all_in_graph_and_eager_modes class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): - def assertAllCloseOrEqual(self, x, y, **kwargs): - if isinstance(x, np.ndarray) or isinstance(x, float): - return super().assertAllClose(x, y, atol=1e-3, **kwargs) - else: - self.assertAllEqual(x, y, **kwargs) - def setUp(self): super().setUp() self.batch = 64 From 45370cdc9d62d46aef55d5fb70ef79de8132c058 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:16:00 +0000 Subject: [PATCH 15/40] Removed some self. --- .../seq2seq/tests/attention_wrapper_test.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index a07ae2b608..b6ca17d723 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -557,18 +557,19 @@ def setUp(self): @parameterized.parameters([np.float32, np.float64]) def testLuongScaledDType(self, dtype): + dummy_data = DummyData2() # Test case for GitHub issue 18099 - encoder_outputs = self.encoder_outputs.astype(dtype) - decoder_inputs = self.decoder_inputs.astype(dtype) + encoder_outputs = dummy_data.encoder_outputs.astype(dtype) + decoder_inputs = dummy_data.decoder_inputs.astype(dtype) attention_mechanism = wrapper.LuongAttention( - units=self.units, + units=dummy_data.units, memory=encoder_outputs, - memory_sequence_length=self.encoder_sequence_length, + memory_sequence_length=dummy_data.encoder_sequence_length, scale=True, dtype=dtype, ) cell = tf.keras.layers.LSTMCell( - self.units, recurrent_activation="sigmoid", dtype=dtype + dummy_data.units, recurrent_activation="sigmoid", dtype=dtype ) cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) @@ -577,8 +578,10 @@ def testLuongScaledDType(self, dtype): final_outputs, final_state, _ = my_decoder( decoder_inputs, - initial_state=cell.get_initial_state(batch_size=self.batch, dtype=dtype), - sequence_length=self.decoder_sequence_length, + initial_state=cell.get_initial_state( + batch_size=dummy_data.batch, dtype=dtype + ), + sequence_length=dummy_data.decoder_sequence_length, ) assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) assert final_outputs.rnn_output.dtype == dtype From 6f41721b57493c8a94b0291739b86c557b8b4973 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:17:35 +0000 Subject: [PATCH 16/40] Moved a function out of tf.test.TestCase. 
--- .../seq2seq/tests/attention_wrapper_test.py | 63 +++++++++---------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index b6ca17d723..708cb3d2f1 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -533,6 +533,37 @@ def test_bahdanau_normalized_dtype(dtype): assert isinstance(final_state, wrapper.AttentionWrapperState) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_luong_scaled_dtype(dtype): + dummy_data = DummyData2() + # Test case for GitHub issue 18099 + encoder_outputs = dummy_data.encoder_outputs.astype(dtype) + decoder_inputs = dummy_data.decoder_inputs.astype(dtype) + attention_mechanism = wrapper.LuongAttention( + units=dummy_data.units, + memory=encoder_outputs, + memory_sequence_length=dummy_data.encoder_sequence_length, + scale=True, + dtype=dtype, + ) + cell = tf.keras.layers.LSTMCell( + dummy_data.units, recurrent_activation="sigmoid", dtype=dtype + ) + cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) + + sampler = sampler_py.TrainingSampler() + my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler, dtype=dtype) + + final_outputs, final_state, _ = my_decoder( + decoder_inputs, + initial_state=cell.get_initial_state(batch_size=dummy_data.batch, dtype=dtype), + sequence_length=dummy_data.decoder_sequence_length, + ) + assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) + assert final_outputs.rnn_output.dtype == dtype + assert isinstance(final_state, wrapper.AttentionWrapperState) + + @test_utils.run_all_in_graph_and_eager_modes class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def setUp(self): @@ -555,38 +586,6 @@ def setUp(self): self.decoder_timestep, size=(self.batch,) ).astype(np.int32) - @parameterized.parameters([np.float32, np.float64]) - def testLuongScaledDType(self, dtype): - dummy_data = DummyData2() - # Test case for GitHub issue 18099 - encoder_outputs = dummy_data.encoder_outputs.astype(dtype) - decoder_inputs = dummy_data.decoder_inputs.astype(dtype) - attention_mechanism = wrapper.LuongAttention( - units=dummy_data.units, - memory=encoder_outputs, - memory_sequence_length=dummy_data.encoder_sequence_length, - scale=True, - dtype=dtype, - ) - cell = tf.keras.layers.LSTMCell( - dummy_data.units, recurrent_activation="sigmoid", dtype=dtype - ) - cell = wrapper.AttentionWrapper(cell, attention_mechanism, dtype=dtype) - - sampler = sampler_py.TrainingSampler() - my_decoder = basic_decoder.BasicDecoder(cell=cell, sampler=sampler, dtype=dtype) - - final_outputs, final_state, _ = my_decoder( - decoder_inputs, - initial_state=cell.get_initial_state( - batch_size=dummy_data.batch, dtype=dtype - ), - sequence_length=dummy_data.decoder_sequence_length, - ) - assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) - assert final_outputs.rnn_output.dtype == dtype - assert isinstance(final_state, wrapper.AttentionWrapperState) - def testBahdanauNotNormalized(self): create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} From 660a5c2ad7db892cd4bb9aebefeebd04b22c1a13 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:21:51 +0000 Subject: [PATCH 17/40] Removed decorator. 
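With the decorator gone, the remaining TestCase methods run only in the default eager setup. Should both modes need exercising again under plain pytest, one possible pattern (purely a sketch, not part of this change; the fixture name is made up here) is to parametrize over the same switch the next patch toggles by hand:

    import pytest
    import tensorflow as tf

    @pytest.fixture(params=[True, False], ids=["fn_eager", "fn_graph"])
    def run_functions_eagerly(request):
        # Toggle eager execution of tf.function-compiled code for one test.
        tf.config.experimental_run_functions_eagerly(request.param)
        yield request.param
        tf.config.experimental_run_functions_eagerly(False)

A test opts in simply by accepting run_functions_eagerly as an argument.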
--- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 708cb3d2f1..5f8be3b957 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -564,7 +564,6 @@ def test_luong_scaled_dtype(dtype): assert isinstance(final_state, wrapper.AttentionWrapperState) -@test_utils.run_all_in_graph_and_eager_modes class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def setUp(self): super().setUp() From 6d5962ecd46fb6e187ae5714c6c8444de9d1c4d5 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:26:53 +0000 Subject: [PATCH 18/40] IIII --- .../seq2seq/tests/attention_wrapper_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 5f8be3b957..15f3b56087 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -586,6 +586,7 @@ def setUp(self): ).astype(np.int32) def testBahdanauNotNormalized(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -630,6 +631,7 @@ def testBahdanauNotNormalized(self): ) def testBahdanauNormalized(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} @@ -666,6 +668,7 @@ def testBahdanauNormalized(self): ) def testLuongNotNormalized(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongAttention expected_final_output = basic_decoder.BasicDecoderOutput( @@ -702,6 +705,7 @@ def testLuongNotNormalized(self): ) def testLuongScaled(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongAttention create_attention_kwargs = {"scale": True} @@ -740,6 +744,7 @@ def testLuongScaled(self): ) def testNotUseAttentionLayer(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} @@ -777,6 +782,7 @@ def testNotUseAttentionLayer(self): ) def testBahdanauMonotonicNotNormalized(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauMonotonicAttention create_attention_kwargs = {"kernel_initializer": "ones"} @@ -820,6 +826,7 @@ def testBahdanauMonotonicNotNormalized(self): ) def testBahdanauMonotonicNormalized(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauMonotonicAttention create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -862,6 +869,7 @@ def testBahdanauMonotonicNormalized(self): ) def testLuongMonotonicNotNormalized(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongMonotonicAttention expected_final_output = basic_decoder.BasicDecoderOutput( @@ -903,6 +911,7 @@ def testLuongMonotonicNotNormalized(self): ) def 
testLuongMonotonicScaled(self): + tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongMonotonicAttention create_attention_kwargs = {"scale": True} From f83718a83bd5ad6ffd0a21bb500ec0b43319bce1 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:36:25 +0000 Subject: [PATCH 19/40] Removed the run functions eagerly. --- .../seq2seq/tests/attention_wrapper_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 15f3b56087..5f8be3b957 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -586,7 +586,6 @@ def setUp(self): ).astype(np.int32) def testBahdanauNotNormalized(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -631,7 +630,6 @@ def testBahdanauNotNormalized(self): ) def testBahdanauNormalized(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} @@ -668,7 +666,6 @@ def testBahdanauNormalized(self): ) def testLuongNotNormalized(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongAttention expected_final_output = basic_decoder.BasicDecoderOutput( @@ -705,7 +702,6 @@ def testLuongNotNormalized(self): ) def testLuongScaled(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongAttention create_attention_kwargs = {"scale": True} @@ -744,7 +740,6 @@ def testLuongScaled(self): ) def testNotUseAttentionLayer(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} @@ -782,7 +777,6 @@ def testNotUseAttentionLayer(self): ) def testBahdanauMonotonicNotNormalized(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauMonotonicAttention create_attention_kwargs = {"kernel_initializer": "ones"} @@ -826,7 +820,6 @@ def testBahdanauMonotonicNotNormalized(self): ) def testBahdanauMonotonicNormalized(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.BahdanauMonotonicAttention create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -869,7 +862,6 @@ def testBahdanauMonotonicNormalized(self): ) def testLuongMonotonicNotNormalized(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongMonotonicAttention expected_final_output = basic_decoder.BasicDecoderOutput( @@ -911,7 +903,6 @@ def testLuongMonotonicNotNormalized(self): ) def testLuongMonotonicScaled(self): - tf.config.experimental_run_functions_eagerly(True) create_attention_mechanism = wrapper.LuongMonotonicAttention create_attention_kwargs = {"scale": True} From bbb9f5535e6d3d4d09fa430da18bf9f15cd2b793 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 09:38:14 +0000 Subject: [PATCH 20/40] Removed import. 
--- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 5f8be3b957..3dad5008e5 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -21,7 +21,6 @@ import numpy as np import tensorflow as tf -from tensorflow_addons.utils import test_utils from tensorflow_addons.seq2seq import attention_wrapper as wrapper from tensorflow_addons.seq2seq import basic_decoder from tensorflow_addons.seq2seq import sampler as sampler_py From 50ab13170f49f1d1d3a0803a5e0691b07707df5e Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:18:20 +0000 Subject: [PATCH 21/40] Removed is_multi. --- .../seq2seq/tests/attention_wrapper_test.py | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 3dad5008e5..0790a756ef 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -320,10 +320,7 @@ def _test_with_attention( attention_layers = [attention_layer] if attention_layer is not None else None create_attention_mechanisms = [create_attention_mechanism] attention_mechanism_depths = [attention_mechanism_depth] - is_multi = False - # Allow is_multi to be True with a single mechanism to enable test for - # passing in a single mechanism in a list. - assert len(create_attention_mechanisms) == 1 or is_multi + assert len(create_attention_mechanisms) == 1 encoder_sequence_length = [3, 2, 3, 1, 1] decoder_sequence_length = [2, 0, 1, 2, 3] batch_size = 5 @@ -385,11 +382,10 @@ def _test_with_attention( attention_layer_size = attention_layer_sizes attention_layer = attention_layers - if not is_multi: - if attention_layer_size is not None: - attention_layer_size = attention_layer_size[0] - if attention_layer is not None: - attention_layer = attention_layer[0] + if attention_layer_size is not None: + attention_layer_size = attention_layer_size[0] + if attention_layer is not None: + attention_layer = attention_layer[0] cell = tf.keras.layers.LSTMCell( cell_depth, recurrent_activation="sigmoid", @@ -398,7 +394,7 @@ def _test_with_attention( ) cell = wrapper.AttentionWrapper( cell, - attention_mechanisms if is_multi else attention_mechanisms[0], + attention_mechanisms[0], attention_layer_size=attention_layer_size, alignment_history=alignment_history, attention_layer=attention_layer, @@ -440,20 +436,10 @@ def _test_with_attention( ) if alignment_history: - if is_multi: - state_alignment_history = [] - for history_array in final_state.alignment_history: - history = history_array.stack() - assert (expected_time, batch_size, encoder_max_time) == tuple( - history.get_shape().as_list() - ) - state_alignment_history.append(history) - state_alignment_history = tuple(state_alignment_history) - else: - state_alignment_history = final_state.alignment_history.stack() - assert (expected_time, batch_size, encoder_max_time) == tuple( - state_alignment_history.get_shape().as_list() - ) + state_alignment_history = final_state.alignment_history.stack() + assert (expected_time, batch_size, encoder_max_time) == tuple( + state_alignment_history.get_shape().as_list() + ) tf.nest.assert_same_structure( cell.state_size, 
cell.get_initial_state(batch_size=batch_size, dtype=tf.float32), From b80de7b8fe5c7b8d87d678de682cddd61ba1371e Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:20:10 +0000 Subject: [PATCH 22/40] Removed self for batch. --- .../seq2seq/tests/attention_wrapper_test.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 0790a756ef..e927047ee5 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -552,22 +552,20 @@ def test_luong_scaled_dtype(dtype): class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def setUp(self): super().setUp() - self.batch = 64 + batch = 64 self.units = 128 self.encoder_timestep = 10 self.encoder_dim = 256 self.decoder_timestep = 12 self.encoder_outputs = np.random.randn( - self.batch, self.encoder_timestep, self.encoder_dim + batch, self.encoder_timestep, self.encoder_dim ) self.encoder_sequence_length = np.random.randint( - 1, high=self.encoder_timestep, size=(self.batch,) + 1, high=self.encoder_timestep, size=(batch,) ).astype(np.int32) - self.decoder_inputs = np.random.randn( - self.batch, self.decoder_timestep, self.units - ) + self.decoder_inputs = np.random.randn(batch, self.decoder_timestep, self.units) self.decoder_sequence_length = np.random.randint( - self.decoder_timestep, size=(self.batch,) + self.decoder_timestep, size=(batch,) ).astype(np.int32) def testBahdanauNotNormalized(self): From 5dfc86aad84e82051e134877bfecc7a722d3a3e1 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:24:22 +0000 Subject: [PATCH 23/40] Removed some more self. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index e927047ee5..a7ba68bde6 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -553,7 +553,7 @@ class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def setUp(self): super().setUp() batch = 64 - self.units = 128 + units = 128 self.encoder_timestep = 10 self.encoder_dim = 256 self.decoder_timestep = 12 @@ -563,7 +563,7 @@ def setUp(self): self.encoder_sequence_length = np.random.randint( 1, high=self.encoder_timestep, size=(batch,) ).astype(np.int32) - self.decoder_inputs = np.random.randn(batch, self.decoder_timestep, self.units) + self.decoder_inputs = np.random.randn(batch, self.decoder_timestep, units) self.decoder_sequence_length = np.random.randint( self.decoder_timestep, size=(batch,) ).astype(np.int32) From 56a346023321ec01f8e03361f452854bb14d3919 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:25:54 +0000 Subject: [PATCH 24/40] Removed some more self. 
--- .../seq2seq/tests/attention_wrapper_test.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index a7ba68bde6..46c992ccad 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -554,18 +554,16 @@ def setUp(self): super().setUp() batch = 64 units = 128 - self.encoder_timestep = 10 - self.encoder_dim = 256 - self.decoder_timestep = 12 - self.encoder_outputs = np.random.randn( - batch, self.encoder_timestep, self.encoder_dim - ) + encoder_timestep = 10 + encoder_dim = 256 + decoder_timestep = 12 + self.encoder_outputs = np.random.randn(batch, encoder_timestep, encoder_dim) self.encoder_sequence_length = np.random.randint( - 1, high=self.encoder_timestep, size=(batch,) + 1, high=encoder_timestep, size=(batch,) ).astype(np.int32) - self.decoder_inputs = np.random.randn(batch, self.decoder_timestep, units) + self.decoder_inputs = np.random.randn(batch, decoder_timestep, units) self.decoder_sequence_length = np.random.randint( - self.decoder_timestep, size=(batch,) + decoder_timestep, size=(batch,) ).astype(np.int32) def testBahdanauNotNormalized(self): From 50b4a0c6a5b98710ea39763e418f64c3406034e6 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:28:42 +0000 Subject: [PATCH 25/40] Removed some stuff. --- .../seq2seq/tests/attention_wrapper_test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 46c992ccad..0737fa3eb6 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -557,14 +557,10 @@ def setUp(self): encoder_timestep = 10 encoder_dim = 256 decoder_timestep = 12 - self.encoder_outputs = np.random.randn(batch, encoder_timestep, encoder_dim) - self.encoder_sequence_length = np.random.randint( - 1, high=encoder_timestep, size=(batch,) - ).astype(np.int32) - self.decoder_inputs = np.random.randn(batch, decoder_timestep, units) - self.decoder_sequence_length = np.random.randint( - decoder_timestep, size=(batch,) - ).astype(np.int32) + np.random.randn(batch, encoder_timestep, encoder_dim) + np.random.randint(1, high=encoder_timestep, size=(batch,)).astype(np.int32) + np.random.randn(batch, decoder_timestep, units) + np.random.randint(decoder_timestep, size=(batch,)).astype(np.int32) def testBahdanauNotNormalized(self): create_attention_mechanism = wrapper.BahdanauAttention From 9aeabec90bd7642a8a5ab0a287238b8694fc3fb9 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:32:31 +0000 Subject: [PATCH 26/40] Unholy stuff there. 
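do_some_stuff() below exists only to keep the hard-coded expected summaries valid. The old setUp consumed a fixed sequence of draws from the seeded NumPy generator, and any NumPy draws made later in a test depend on the state those earlier calls left behind; re-seeding and replaying the same, now discarded, draws reproduces that state exactly. A minimal illustration of the principle (shapes are illustrative):

    import numpy as np

    np.random.seed(87654321)
    np.random.randn(64, 10, 256)   # discarded, but it advances the generator
    a = np.random.randn(3)

    np.random.seed(87654321)
    np.random.randn(64, 10, 256)   # the same discarded draw
    b = np.random.randn(3)

    assert np.allclose(a, b)       # later draws line up again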
--- .../seq2seq/tests/attention_wrapper_test.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 0737fa3eb6..10adedf1cf 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -549,20 +549,22 @@ def test_luong_scaled_dtype(dtype): assert isinstance(final_state, wrapper.AttentionWrapperState) -class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): - def setUp(self): - super().setUp() - batch = 64 - units = 128 - encoder_timestep = 10 - encoder_dim = 256 - decoder_timestep = 12 - np.random.randn(batch, encoder_timestep, encoder_dim) - np.random.randint(1, high=encoder_timestep, size=(batch,)).astype(np.int32) - np.random.randn(batch, decoder_timestep, units) - np.random.randint(decoder_timestep, size=(batch,)).astype(np.int32) +def do_some_stuff(): + np.random.seed(87654321) + batch = 64 + units = 128 + encoder_timestep = 10 + encoder_dim = 256 + decoder_timestep = 12 + np.random.randn(batch, encoder_timestep, encoder_dim) + np.random.randint(1, high=encoder_timestep, size=(batch,)).astype(np.int32) + np.random.randn(batch, decoder_timestep, units) + np.random.randint(decoder_timestep, size=(batch,)).astype(np.int32) + +class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def testBahdanauNotNormalized(self): + do_some_stuff() create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -607,6 +609,7 @@ def testBahdanauNotNormalized(self): ) def testBahdanauNormalized(self): + do_some_stuff() create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} @@ -643,6 +646,7 @@ def testBahdanauNormalized(self): ) def testLuongNotNormalized(self): + do_some_stuff() create_attention_mechanism = wrapper.LuongAttention expected_final_output = basic_decoder.BasicDecoderOutput( @@ -679,6 +683,7 @@ def testLuongNotNormalized(self): ) def testLuongScaled(self): + do_some_stuff() create_attention_mechanism = wrapper.LuongAttention create_attention_kwargs = {"scale": True} @@ -717,6 +722,7 @@ def testLuongScaled(self): ) def testNotUseAttentionLayer(self): + do_some_stuff() create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} @@ -754,6 +760,7 @@ def testNotUseAttentionLayer(self): ) def testBahdanauMonotonicNotNormalized(self): + do_some_stuff() create_attention_mechanism = wrapper.BahdanauMonotonicAttention create_attention_kwargs = {"kernel_initializer": "ones"} @@ -797,6 +804,7 @@ def testBahdanauMonotonicNotNormalized(self): ) def testBahdanauMonotonicNormalized(self): + do_some_stuff() create_attention_mechanism = wrapper.BahdanauMonotonicAttention create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -839,6 +847,7 @@ def testBahdanauMonotonicNormalized(self): ) def testLuongMonotonicNotNormalized(self): + do_some_stuff() create_attention_mechanism = wrapper.LuongMonotonicAttention expected_final_output = basic_decoder.BasicDecoderOutput( @@ -880,6 +889,7 @@ def testLuongMonotonicNotNormalized(self): ) def testLuongMonotonicScaled(self): + do_some_stuff() create_attention_mechanism = 
wrapper.LuongMonotonicAttention create_attention_kwargs = {"scale": True} From 92a0b971d6d0b715bfa2da1ff9348be4277388ce Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:43:21 +0000 Subject: [PATCH 27/40] Found a way to replace that. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 10adedf1cf..5566bda9c0 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -455,11 +455,13 @@ def _test_with_attention( self.evaluate(tf.compat.v1.global_variables_initializer()) eval_result = self.evaluate( { - "final_outputs": final_outputs, "final_state": final_state, "state_alignment_history": state_alignment_history, } ) + eval_result["final_outputs"] = tf.nest.map_structure( + lambda x: x.numpy(), final_outputs + ) final_output_info = tf.nest.map_structure( get_result_summary, eval_result["final_outputs"] From 898af3ffed4443ea403c378af26f1eaa28108a36 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:44:35 +0000 Subject: [PATCH 28/40] Works well. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 5566bda9c0..0bff6b6349 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -459,13 +459,9 @@ def _test_with_attention( "state_alignment_history": state_alignment_history, } ) - eval_result["final_outputs"] = tf.nest.map_structure( - lambda x: x.numpy(), final_outputs - ) + final_outputs = tf.nest.map_structure(lambda x: x.numpy(), final_outputs) - final_output_info = tf.nest.map_structure( - get_result_summary, eval_result["final_outputs"] - ) + final_output_info = tf.nest.map_structure(get_result_summary, final_outputs) final_state_info = tf.nest.map_structure( get_result_summary, eval_result["final_state"] ) From ac412e074f727d72e7f7fb95f516c0856c51a164 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:46:19 +0000 Subject: [PATCH 29/40] Removed self completely from function. 
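With graph mode out of the picture, the session-style self.evaluate round trip is unnecessary: every result is already an eager tensor, and tf.nest.map_structure applies the .numpy() conversion across the nested namedtuples and tuples in one pass while preserving their structure. For example (toy structure, not the test's):

    import tensorflow as tf

    nested = {"a": tf.constant([1.0, 2.0]), "b": (tf.constant(3), tf.constant(4))}
    as_numpy = tf.nest.map_structure(lambda t: t.numpy(), nested)
    # {'a': array([1., 2.], dtype=float32), 'b': (3, 4)}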
--- .../seq2seq/tests/attention_wrapper_test.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 0bff6b6349..b8d0a71a6a 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -452,19 +452,14 @@ def _test_with_attention( else: state_alignment_history = () - self.evaluate(tf.compat.v1.global_variables_initializer()) - eval_result = self.evaluate( - { - "final_state": final_state, - "state_alignment_history": state_alignment_history, - } - ) final_outputs = tf.nest.map_structure(lambda x: x.numpy(), final_outputs) - - final_output_info = tf.nest.map_structure(get_result_summary, final_outputs) - final_state_info = tf.nest.map_structure( - get_result_summary, eval_result["final_state"] + final_state = tf.nest.map_structure(lambda x: x.numpy(), final_state) + state_alignment_history = tf.nest.map_structure( + lambda x: x.numpy(), state_alignment_history ) + final_output_info = tf.nest.map_structure(get_result_summary, final_outputs) + + final_state_info = tf.nest.map_structure(get_result_summary, final_state) tf.nest.map_structure( assert_allclose_or_equal, expected_final_output, final_output_info @@ -475,7 +470,7 @@ def _test_with_attention( # by default, the wrapper emits attention as output if alignment_history: final_alignment_history_info = tf.nest.map_structure( - get_result_summary, eval_result["state_alignment_history"] + get_result_summary, state_alignment_history ) tf.nest.map_structure( assert_allclose_or_equal, From 51ba1373c2db587d59033b78ab9169ed5678f98a Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:47:50 +0000 Subject: [PATCH 30/40] Removed self from parameters. 
--- .../seq2seq/tests/attention_wrapper_test.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index b8d0a71a6a..9c24f9289a 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -301,7 +301,6 @@ def test_custom_attention_layer(): def _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -591,7 +590,6 @@ def testBahdanauNotNormalized(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -630,7 +628,6 @@ def testBahdanauNormalized(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -668,7 +665,6 @@ def testLuongNotNormalized(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -706,7 +702,6 @@ def testLuongScaled(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -743,7 +738,6 @@ def testNotUseAttentionLayer(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -786,7 +780,6 @@ def testBahdanauMonotonicNotNormalized(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -829,7 +822,6 @@ def testBahdanauMonotonicNormalized(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -872,7 +864,6 @@ def testLuongMonotonicNotNormalized(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, @@ -915,7 +906,6 @@ def testLuongMonotonicScaled(self): ) _test_with_attention( - self, create_attention_mechanism, expected_final_output, expected_final_state, From e2301f265957bd9bb4f6c662ad8ce43faee53a90 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 11:58:56 +0000 Subject: [PATCH 31/40] Managed to move the function. 
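Now that the test is a plain function, it pins TensorFlow's global seed itself (tf.random.set_seed in the hunk below) so the unseeded random ops behind the layer initializers draw the same values on every run and the hard-coded summary means keep matching. The relevant behavior, in isolation:

    import tensorflow as tf

    tf.random.set_seed(87654321)
    first = tf.random.uniform([2])
    second = tf.random.uniform([2])

    # Re-setting the global seed restarts the op-seed sequence,
    # so the same draws come back in the same order.
    tf.random.set_seed(87654321)
    assert (first == tf.random.uniform([2])).numpy().all()
    assert (second == tf.random.uniform([2])).numpy().all()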
--- .../seq2seq/tests/attention_wrapper_test.py | 82 +++++++++---------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 9c24f9289a..d818d42e73 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -554,51 +554,47 @@ def do_some_stuff(): np.random.randint(decoder_timestep, size=(batch,)).astype(np.int32) -class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): - def testBahdanauNotNormalized(self): - do_some_stuff() - create_attention_mechanism = wrapper.BahdanauAttention - create_attention_kwargs = {"kernel_initializer": "ones"} - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype(np.float32), mean=-0.003204414 - ), - sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype(np.int32), mean=3.2), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary( - shape=(5, 9), dtype=np.dtype(np.float32), mean=0.40868404 - ), - ResultSummary( - shape=(5, 9), dtype=np.dtype(np.float32), mean=0.89017969 - ), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype(np.float32), mean=0.041453815 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125 - ), - alignment_history=(), - ) - expected_final_alignment_history = ResultSummary( - shape=(3, 5, 8), dtype=np.dtype(np.float32), mean=0.125 - ) +def test_bahdanau_not_normalized(): + tf.random.set_seed(87654321) + do_some_stuff() + create_attention_mechanism = wrapper.BahdanauAttention + create_attention_kwargs = {"kernel_initializer": "ones"} + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype(np.float32), mean=-0.003204414 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype(np.int32), mean=3.2), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype(np.float32), mean=0.40868404), + ResultSummary(shape=(5, 9), dtype=np.dtype(np.float32), mean=0.89017969), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype(np.float32), mean=0.041453815 + ), + alignments=ResultSummary(shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125 + ), + alignment_history=(), + ) + expected_final_alignment_history = ResultSummary( + shape=(3, 5, 8), dtype=np.dtype(np.float32), mean=0.125 + ) - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - alignment_history=True, - create_query_layer=True, - expected_final_alignment_history=expected_final_alignment_history, - create_attention_kwargs=create_attention_kwargs, - ) + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + alignment_history=True, + create_query_layer=True, + expected_final_alignment_history=expected_final_alignment_history, + create_attention_kwargs=create_attention_kwargs, + ) + +class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): def testBahdanauNormalized(self): do_some_stuff() create_attention_mechanism = wrapper.BahdanauAttention From 8797ab00cf82490999ddfd55442d96760d334031 Mon Sep 17 00:00:00 
2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:05:55 +0000 Subject: [PATCH 32/40] It works with pytest only. --- .../seq2seq/tests/attention_wrapper_test.py | 590 +++++++++--------- 1 file changed, 290 insertions(+), 300 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index d818d42e73..3c66638fce 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -554,9 +554,13 @@ def do_some_stuff(): np.random.randint(decoder_timestep, size=(batch,)).astype(np.int32) -def test_bahdanau_not_normalized(): +def set_random_state_for_tf_and_np(): tf.random.set_seed(87654321) do_some_stuff() + + +def test_bahdanau_not_normalized(): + set_random_state_for_tf_and_np() create_attention_mechanism = wrapper.BahdanauAttention create_attention_kwargs = {"kernel_initializer": "ones"} expected_final_output = basic_decoder.BasicDecoderOutput( @@ -594,322 +598,308 @@ def test_bahdanau_not_normalized(): ) -class AttentionWrapperTest(tf.test.TestCase, parameterized.TestCase): - def testBahdanauNormalized(self): - do_some_stuff() - create_attention_mechanism = wrapper.BahdanauAttention - create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} +def test_bahdanau_normalized(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.BahdanauAttention + create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.008089137 - ), - sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=2.8), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.49166861), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.01068615), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=0.042427111 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - alignment_history=(), - ) + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.008089137 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=2.8), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.49166861), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.01068615), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=0.042427111 + ), + alignments=ResultSummary(shape=(5, 8), dtype=np.dtype("float32"), mean=0.125), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 + ), + alignment_history=(), + ) - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - create_query_layer=True, - create_attention_kwargs=create_attention_kwargs, - ) + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + create_query_layer=True, + create_attention_kwargs=create_attention_kwargs, + ) - def testLuongNotNormalized(self): - do_some_stuff() - create_attention_mechanism = wrapper.LuongAttention - - 
expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.06124732 - ), - sample_id=ResultSummary( - shape=(5, 3), dtype=np.dtype("int32"), mean=2.73333333 - ), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.52021580), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.0964939), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=-0.0318060 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - alignment_history=(), - ) - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - attention_mechanism_depth=9, - ) +def test_luong_not_normalized(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.LuongAttention - def testLuongScaled(self): - do_some_stuff() - create_attention_mechanism = wrapper.LuongAttention - create_attention_kwargs = {"scale": True} - - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.06124732 - ), - sample_id=ResultSummary( - shape=(5, 3), dtype=np.dtype("int32"), mean=2.73333333 - ), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.52021580), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.0964939), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=-0.0318060 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - alignment_history=(), - ) + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.06124732 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=2.73333333), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.52021580), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.0964939), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=-0.0318060 + ), + alignments=ResultSummary(shape=(5, 8), dtype=np.dtype("float32"), mean=0.125), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 + ), + alignment_history=(), + ) - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - attention_mechanism_depth=9, - create_attention_kwargs=create_attention_kwargs, - ) + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + attention_mechanism_depth=9, + ) - def testNotUseAttentionLayer(self): - do_some_stuff() - create_attention_mechanism = wrapper.BahdanauAttention - create_attention_kwargs = {"kernel_initializer": "ones"} - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 10), dtype=np.dtype("float32"), mean=0.078317143 - ), - sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=4.2), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), 
dtype=np.dtype("float32"), mean=0.89382392), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.722382), - ], - attention=ResultSummary( - shape=(5, 10), dtype=np.dtype("float32"), mean=0.026356646 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 - ), - alignment_history=(), - ) +def test_luong_scaled(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.LuongAttention + create_attention_kwargs = {"scale": True} - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - attention_layer_size=None, - create_query_layer=True, - create_attention_kwargs=create_attention_kwargs, - ) + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.06124732 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=2.73333333), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.52021580), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.0964939), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=-0.0318060 + ), + alignments=ResultSummary(shape=(5, 8), dtype=np.dtype("float32"), mean=0.125), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 + ), + alignment_history=(), + ) - def testBahdanauMonotonicNotNormalized(self): - do_some_stuff() - create_attention_mechanism = wrapper.BahdanauMonotonicAttention - create_attention_kwargs = {"kernel_initializer": "ones"} - - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.009921653 - ), - sample_id=ResultSummary( - shape=(5, 3), dtype=np.dtype("int32"), mean=3.13333333 - ), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.44612807), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.95786464), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=0.038682378 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.09778417 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.09778417 - ), - alignment_history=(), - ) - expected_final_alignment_history = ResultSummary( - shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.10261579603 - ) + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + attention_mechanism_depth=9, + create_attention_kwargs=create_attention_kwargs, + ) - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - alignment_history=True, - expected_final_alignment_history=expected_final_alignment_history, - create_query_layer=True, - create_attention_kwargs=create_attention_kwargs, - ) - def testBahdanauMonotonicNormalized(self): - do_some_stuff() - create_attention_mechanism = wrapper.BahdanauMonotonicAttention - create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.007140680 - ), - sample_id=ResultSummary( - shape=(5, 3), dtype=np.dtype("int32"), 
mean=3.26666666 - ), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.47012400), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.0249618), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=0.068432882 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.0615656 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.0615656 - ), - alignment_history=(), - ) - expected_final_alignment_history = ResultSummary( - shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.07909643 - ) +def test_not_use_attention_layer(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.BahdanauAttention + create_attention_kwargs = {"kernel_initializer": "ones"} - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - alignment_history=True, - expected_final_alignment_history=expected_final_alignment_history, - create_query_layer=True, - create_attention_kwargs=create_attention_kwargs, - ) + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 10), dtype=np.dtype("float32"), mean=0.078317143 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=4.2), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.89382392), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.722382), + ], + attention=ResultSummary( + shape=(5, 10), dtype=np.dtype("float32"), mean=0.026356646 + ), + alignments=ResultSummary(shape=(5, 8), dtype=np.dtype("float32"), mean=0.125), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.125 + ), + alignment_history=(), + ) - def testLuongMonotonicNotNormalized(self): - do_some_stuff() - create_attention_mechanism = wrapper.LuongMonotonicAttention - - expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.003664831 - ), - sample_id=ResultSummary( - shape=(5, 3), dtype=np.dtype("int32"), mean=3.06666666 - ), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.54318606), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.12592840), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=0.059128221 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 - ), - alignment_history=(), - ) - expected_final_alignment_history = ResultSummary( - shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.06994973868 - ) + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + attention_layer_size=None, + create_query_layer=True, + create_attention_kwargs=create_attention_kwargs, + ) - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - attention_mechanism_depth=9, - alignment_history=True, - expected_final_alignment_history=expected_final_alignment_history, - ) - def testLuongMonotonicScaled(self): - do_some_stuff() - create_attention_mechanism = wrapper.LuongMonotonicAttention - create_attention_kwargs = {"scale": True} - - 
expected_final_output = basic_decoder.BasicDecoderOutput( - rnn_output=ResultSummary( - shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.003664831 - ), - sample_id=ResultSummary( - shape=(5, 3), dtype=np.dtype("int32"), mean=3.06666666 - ), - ) - expected_final_state = wrapper.AttentionWrapperState( - cell_state=[ - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.54318606), - ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.12592840), - ], - attention=ResultSummary( - shape=(5, 6), dtype=np.dtype("float32"), mean=0.059128221 - ), - alignments=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 - ), - attention_state=ResultSummary( - shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 - ), - alignment_history=(), - ) - expected_final_alignment_history = ResultSummary( - shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.06994973868 - ) +def test_bahdanau_monotonic_not_normalized(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.BahdanauMonotonicAttention + create_attention_kwargs = {"kernel_initializer": "ones"} - _test_with_attention( - create_attention_mechanism, - expected_final_output, - expected_final_state, - attention_mechanism_depth=9, - alignment_history=True, - expected_final_alignment_history=expected_final_alignment_history, - create_attention_kwargs=create_attention_kwargs, - ) + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=-0.009921653 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=3.13333333), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.44612807), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.95786464), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=0.038682378 + ), + alignments=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.09778417 + ), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.09778417 + ), + alignment_history=(), + ) + expected_final_alignment_history = ResultSummary( + shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.10261579603 + ) + + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + alignment_history=True, + expected_final_alignment_history=expected_final_alignment_history, + create_query_layer=True, + create_attention_kwargs=create_attention_kwargs, + ) + + +def test_bahdanau_monotonic_normalized(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.BahdanauMonotonicAttention + create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True} + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.007140680 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=3.26666666), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.47012400), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.0249618), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=0.068432882 + ), + alignments=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.0615656 + ), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.0615656 + ), + 
alignment_history=(), + ) + expected_final_alignment_history = ResultSummary( + shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.07909643 + ) + + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + alignment_history=True, + expected_final_alignment_history=expected_final_alignment_history, + create_query_layer=True, + create_attention_kwargs=create_attention_kwargs, + ) + + +def test_luong_monotonic_not_normalized(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.LuongMonotonicAttention + + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.003664831 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=3.06666666), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.54318606), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.12592840), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=0.059128221 + ), + alignments=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 + ), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 + ), + alignment_history=(), + ) + expected_final_alignment_history = ResultSummary( + shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.06994973868 + ) + + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + attention_mechanism_depth=9, + alignment_history=True, + expected_final_alignment_history=expected_final_alignment_history, + ) + + +def test_luong_monotonic_scaled(): + set_random_state_for_tf_and_np() + create_attention_mechanism = wrapper.LuongMonotonicAttention + create_attention_kwargs = {"scale": True} + + expected_final_output = basic_decoder.BasicDecoderOutput( + rnn_output=ResultSummary( + shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.003664831 + ), + sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype("int32"), mean=3.06666666), + ) + expected_final_state = wrapper.AttentionWrapperState( + cell_state=[ + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=0.54318606), + ResultSummary(shape=(5, 9), dtype=np.dtype("float32"), mean=1.12592840), + ], + attention=ResultSummary( + shape=(5, 6), dtype=np.dtype("float32"), mean=0.059128221 + ), + alignments=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 + ), + attention_state=ResultSummary( + shape=(5, 8), dtype=np.dtype("float32"), mean=0.05112994 + ), + alignment_history=(), + ) + expected_final_alignment_history = ResultSummary( + shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.06994973868 + ) + + _test_with_attention( + create_attention_mechanism, + expected_final_output, + expected_final_state, + attention_mechanism_depth=9, + alignment_history=True, + expected_final_alignment_history=expected_final_alignment_history, + create_attention_kwargs=create_attention_kwargs, + ) def test_attention_state_with_keras_rnn(): From b1cbf4d535624fd734b89b7b87d83749c6d430a1 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:08:25 +0000 Subject: [PATCH 33/40] Fully converted to pytest. 
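
The do_some_stuff() helper only existed to replay the random draws of the old class-based setup, so it is folded into set_random_state_for_tf_and_np(): the helper now seeds TensorFlow and NumPy itself and builds a DummyData2() so random values are consumed in the same order as when the expected ResultSummary values were hard-coded. A minimal, self-contained sketch of why the call order matters (names simplified, not the committed code):

    import numpy as np
    import tensorflow as tf

    def set_random_state():
        tf.random.set_seed(87654321)
        np.random.seed(87654321)

    set_random_state()
    a = np.random.randn(3)        # the draw a hard-coded expectation was based on
    set_random_state()
    _ = np.random.randn(3)        # an extra draw shifts the stream ...
    b = np.random.randn(3)
    assert not np.allclose(a, b)  # ... so recorded values would no longer match
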
--- .../seq2seq/tests/attention_wrapper_test.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 3c66638fce..b0a946f2a3 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -541,22 +541,10 @@ def test_luong_scaled_dtype(dtype): assert isinstance(final_state, wrapper.AttentionWrapperState) -def do_some_stuff(): - np.random.seed(87654321) - batch = 64 - units = 128 - encoder_timestep = 10 - encoder_dim = 256 - decoder_timestep = 12 - np.random.randn(batch, encoder_timestep, encoder_dim) - np.random.randint(1, high=encoder_timestep, size=(batch,)).astype(np.int32) - np.random.randn(batch, decoder_timestep, units) - np.random.randint(decoder_timestep, size=(batch,)).astype(np.int32) - - def set_random_state_for_tf_and_np(): tf.random.set_seed(87654321) - do_some_stuff() + np.random.seed(87654321) + DummyData2() def test_bahdanau_not_normalized(): From 826a61e49eba2899a8b0e984c88f338e7fa6fcb1 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:09:28 +0000 Subject: [PATCH 34/40] Minor simplification. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index b0a946f2a3..115bd8a4d7 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -451,11 +451,9 @@ def _test_with_attention( else: state_alignment_history = () - final_outputs = tf.nest.map_structure(lambda x: x.numpy(), final_outputs) - final_state = tf.nest.map_structure(lambda x: x.numpy(), final_state) - state_alignment_history = tf.nest.map_structure( - lambda x: x.numpy(), state_alignment_history - ) + final_outputs = tf.nest.map_structure(np.array, final_outputs) + final_state = tf.nest.map_structure(np.array, final_state) + state_alignment_history = tf.nest.map_structure(np.array, state_alignment_history) final_output_info = tf.nest.map_structure(get_result_summary, final_outputs) final_state_info = tf.nest.map_structure(get_result_summary, final_state) From 120e4f4215f09e329c82892018c8eff80889258c Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:12:12 +0000 Subject: [PATCH 35/40] Some comment. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 115bd8a4d7..2245fc2e8a 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -540,6 +540,10 @@ def test_luong_scaled_dtype(dtype): def set_random_state_for_tf_and_np(): + """Since the results of the tests have been hardcoded, we need to make sure, + when we refactor code that the random state is the same. Meaning that all + random functions should be called in the same order. + """ tf.random.set_seed(87654321) np.random.seed(87654321) DummyData2() From 85a76b8c4442d0c12cb0b14796a58bcc06bf81eb Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:15:27 +0000 Subject: [PATCH 36/40] Removed some unused functions. 
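
With the whole file converted to plain pytest functions, the run_all_in_graph_and_eager_modes / run_in_graph_and_eager_modes re-exports and the run_with_types / run_all_with_types decorators in test_utils.py have no callers left, and the absl parameterized import in the test module is unused. Dtype coverage of the kind run_with_types used to provide is written directly with pytest, as test_luong_scaled_dtype(dtype) already does; roughly like this (illustrative sketch, the test name and dtype list below are made up):

    import numpy as np
    import pytest

    @pytest.mark.parametrize("dtype", [np.float16, np.float32])
    def test_runs_once_per_dtype(dtype):
        x = np.zeros((2, 3), dtype=dtype)
        assert x.dtype == dtype
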
--- .../seq2seq/tests/attention_wrapper_test.py | 1 - tensorflow_addons/utils/test_utils.py | 44 +------------------ 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 2245fc2e8a..67cce0ce75 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -17,7 +17,6 @@ import collections import pytest -from absl.testing import parameterized import numpy as np import tensorflow as tf diff --git a/tensorflow_addons/utils/test_utils.py b/tensorflow_addons/utils/test_utils.py index 6f8c48b328..4a16cc4b86 100644 --- a/tensorflow_addons/utils/test_utils.py +++ b/tensorflow_addons/utils/test_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Utilities for tf.test.TestCase.""" +"""Utilities for testing Addons.""" import contextlib import inspect @@ -25,13 +25,7 @@ from tensorflow_addons.utils import resource_loader -# TODO: find public API alternative to these -from tensorflow.python.framework.test_util import ( # noqa: F401 - run_all_in_graph_and_eager_modes, -) -from tensorflow.python.framework.test_util import ( # noqa: F401 - run_in_graph_and_eager_modes, -) +# TODO: copy the layer_test implementation in Addons. from tensorflow.python.keras.testing_utils import layer_test # noqa: F401 @@ -131,40 +125,6 @@ def decorated(self, *args, **kwargs): return decorator -def run_all_with_types(dtypes): - """Execute all test methods in the given class with and without eager.""" - base_decorator = run_with_types(dtypes) - - def decorator(cls): - for name, method in cls.__dict__.copy().items(): - if ( - callable(method) - and name.startswith(unittest.TestLoader.testMethodPrefix) - and name != "test_session" - ): - setattr(cls, name, base_decorator(method)) - return cls - - return decorator - - -def run_with_types(dtypes): - def decorator(f): - if inspect.isclass(f): - raise TypeError( - "`run_with_types` only supports test methods. " - "Did you mean to use `run_all_with_types`?" - ) - - def decorated(self, *args, **kwargs): - for t in dtypes: - f(self, *args, dtype=t, **kwargs) - - return decorated - - return decorator - - def finalizer(): tf.config.experimental_run_functions_eagerly(False) From a90ab5d537cf5ef3cdd7aa43a5e019cdfaf1203d Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:20:16 +0000 Subject: [PATCH 37/40] Removed some elements from the test_no_deprecated_v1. 
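
These test files no longer call tf.compat.v1, so they can leave the exemption list and be covered by the normal scan. test_no_deprecated_v1 walks every non-blacklisted source line via get_lines_of_source_code(blacklist) and fails when a deprecated v1 call is still present; a simplified, self-contained stand-in for that check (the real helper and error message in tools/testing/source_code_test.py may differ):

    from pathlib import Path

    def assert_no_deprecated_v1(blacklist):
        # Walk the package sources, skipping only the exempted files.
        for path in Path("tensorflow_addons").rglob("*.py"):
            if str(path) in blacklist:
                continue
            for idx, line in enumerate(path.read_text().splitlines()):
                assert "tf.compat.v1" not in line, f"{path}:{idx + 1}: {line.strip()}"
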
--- tools/testing/source_code_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/testing/source_code_test.py b/tools/testing/source_code_test.py index 64b29226e9..bfe5d3bec4 100644 --- a/tools/testing/source_code_test.py +++ b/tools/testing/source_code_test.py @@ -144,16 +144,10 @@ def test_no_deprecated_v1(): blacklist = [ "tensorflow_addons/text/skip_gram_ops.py", "tensorflow_addons/text/tests/skip_gram_ops_test.py", - "tensorflow_addons/optimizers/tests/lazy_adam_test.py", - "tensorflow_addons/metrics/tests/matthews_correlation_coefficient_test.py", - "tensorflow_addons/seq2seq/tests/decoder_test.py", - "tensorflow_addons/metrics/tests/cohens_kappa_test.py", - "tensorflow_addons/optimizers/tests/cyclical_learning_rate_test.py", "tensorflow_addons/metrics/tests/f_scores_test.py", "tensorflow_addons/seq2seq/tests/basic_decoder_test.py", "tensorflow_addons/seq2seq/tests/beam_search_decoder_test.py", "tensorflow_addons/seq2seq/decoder.py", - "tensorflow_addons/metrics/tests/multilabel_confusion_matrix_test.py", "tensorflow_addons/seq2seq/tests/attention_wrapper_test.py", ] for file_path, line_idx, line in get_lines_of_source_code(blacklist): From 03883bbdbe6af21637243b45bdf1b91cae4be09d Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Sun, 19 Apr 2020 12:24:02 +0000 Subject: [PATCH 38/40] Removed some files from blacklists. --- .flake8 | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.flake8 b/.flake8 index d889c65d68..07ccf37be1 100644 --- a/.flake8 +++ b/.flake8 @@ -27,12 +27,5 @@ per-file-ignores = tensorflow_addons/image/tests/utils_test.py:N802 tensorflow_addons/image/tests/color_ops_test.py:N802 tensorflow_addons/optimizers/tests/conditional_gradient_test.py:N802 - tensorflow_addons/optimizers/tests/lazy_adam_test.py:N802 - tensorflow_addons/seq2seq/tests/attention_wrapper_test.py:N802 - tensorflow_addons/seq2seq/tests/basic_decoder_test.py:N802 - tensorflow_addons/seq2seq/tests/decoder_test.py:N802 - tensorflow_addons/seq2seq/tests/beam_search_decoder_test.py:N802 - tensorflow_addons/seq2seq/tests/beam_search_ops_test.py:N802 - tensorflow_addons/optimizers/tests/cyclical_learning_rate_test.py:N802 # variable ... in function should be lowercase tensorflow_addons/callbacks/tests/time_stopping_test.py:N806 From 39698a2bbc7d8cc060c1088537c3d6e474cf3573 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Thu, 23 Apr 2020 14:43:06 +0000 Subject: [PATCH 39/40] Forgot to remove run_eagerly. --- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 67cce0ce75..85c69832bf 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -210,10 +210,7 @@ def call(self, inputs): dummy_data = DummyData() model = MyModel(vocab, embedding_dim, dummy_data.memory_size, dummy_data.units) - if tf.executing_eagerly(): - model.compile("rmsprop", "mse", run_eagerly=True) - else: - model.compile("rmsprop", "mse") + model.compile("rmsprop", "mse") x = np.random.randint( vocab, size=(num_batches * dummy_data.batch, dummy_data.timestep) From 702453beeac8476e2d321d6cc717bfb27567e84f Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Thu, 23 Apr 2020 14:44:32 +0000 Subject: [PATCH 40/40] Removed tf.executing_eagerly. 
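
The conditional only existed for the old graph-mode runs, where the decoder output's static time dimension was unknown and get_shape() reported None. With the suite running eagerly under pytest, the output is a concrete tensor, so expected_time can always be max(decoder_sequence_length). A self-contained illustration of the difference (not code from this patch; the shapes are arbitrary):

    import tensorflow as tf

    # A concrete (eager) tensor has a fully defined static shape ...
    eager_out = tf.zeros([5, 12, 6])                  # batch, time, depth
    assert eager_out.get_shape().as_list() == [5, 12, 6]

    # ... while a symbolic tensor traced with an unknown time dimension
    # reports None there, which is what the removed "else None" branch matched.
    @tf.function(input_signature=[tf.TensorSpec([5, None, 6], tf.float32)])
    def time_dim_is_unknown(x):
        return tf.constant(x.get_shape().as_list()[1] is None)

    assert bool(time_dim_is_unknown(eager_out))
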
--- tensorflow_addons/seq2seq/tests/attention_wrapper_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py index 85c69832bf..80ce27e4d7 100644 --- a/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py +++ b/tensorflow_addons/seq2seq/tests/attention_wrapper_test.py @@ -412,7 +412,7 @@ def _test_with_attention( assert isinstance(final_outputs, basic_decoder.BasicDecoderOutput) assert isinstance(final_state, wrapper.AttentionWrapperState) - expected_time = max(decoder_sequence_length) if tf.executing_eagerly() else None + expected_time = max(decoder_sequence_length) assert (batch_size, expected_time, attention_depth) == tuple( final_outputs.rnn_output.get_shape().as_list() )