Commit 8f4f163

swolchok authored and pytorchmergebot committed
[PyTorch] Flip polarity of masked_softmax mask (#78)
Summary:
X-link: pytorch/pytorch-canary#78
Pull Request resolved: pytorch#75039

The mask polarity didn't match torch.nn.MultiheadAttention. Now it does.

ghstack-source-id: 152815449

Test Plan: updated tests

Reviewed By: zrphercule

Differential Revision: D34929186

fbshipit-source-id: 1eaee615bafd5a6f058f1faefa54f8f4aa01c92e
(cherry picked from commit 00eea72)
1 parent 87ab665 commit 8f4f163
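
The gist of the change: torch._masked_softmax previously treated a True mask entry as "keep this position"; after this commit a True entry means "mask this position out", which is the convention torch.nn.MultiheadAttention uses for its boolean masks. A minimal sketch of the new convention using only public ops (the tensor values are illustrative, not taken from the tests):

import torch

scores = torch.tensor([[1.0, 2.0, 3.0]])
mask = torch.tensor([[False, False, True]])  # True = masked out (new convention)

# What the CUDA fallback in this commit now computes internally:
out = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)
print(out)  # the last entry is 0; the first two renormalize among themselves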

File tree

4 files changed: +8 -7 lines changed


aten/src/ATen/native/SoftMax.cpp

Lines changed: 2 additions & 2 deletions
@@ -170,7 +170,7 @@ void host_softmax(
           }
         } else {
           for (const auto d : c10::irange(0, dim_size)) {
-            if (mask_data[d * dim_stride]) {
+            if (!mask_data[d * dim_stride]) {
               max_input = is_meaningful_max
                   ? std::max(max_input, input_data[d * dim_stride])
                   : input_data[d * dim_stride];
@@ -183,7 +183,7 @@ void host_softmax(
         acc_type<scalar_t, false> tmpsum = 0;
         for (const auto d : c10::irange(dim_size)) {
           scalar_t z{};
-          if (!MaskedSoftMax || mask_data[d * dim_stride]) {
+          if (!MaskedSoftMax || !mask_data[d * dim_stride]) {
             z = std::exp(input_data[d * dim_stride] - max_input);
           } else {
             z = 0;
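
For reference, here is a rough Python transcription of the flipped CPU loop above for a single row (names are illustrative; the log-softmax path and fully-masked-row handling of the real kernel are omitted):

import math

def masked_softmax_row(row, mask):
    # Only entries whose mask is False (not masked out) feed the running max and the sum.
    kept = [x for x, m in zip(row, mask) if not m]
    max_input = max(kept)  # a fully masked row is not handled in this sketch
    exps = [0.0 if m else math.exp(x - max_input) for x, m in zip(row, mask)]
    total = sum(exps)
    return [e / total for e in exps]

print(masked_softmax_row([1.0, 2.0, 3.0], [False, False, True]))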

aten/src/ATen/native/cuda/PersistentSoftmax.cuh

Lines changed: 3 additions & 3 deletions
@@ -126,7 +126,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
         if (!is_transformer_mask) {
           idx += i*element_count;
         }
-        if (mask[idx]) {
+        if (!mask[idx]) {
           max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
           is_meaningful_max = true;
         }
@@ -160,7 +160,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
           idx += i*element_count;
         }

-        if (mask[idx]) {
+        if (!mask[idx]) {
           if (is_log_softmax) {
             sum[i] += std::exp(elements[i][it] - max_value[i]);
           } else {
@@ -188,7 +188,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
         if (!is_transformer_mask) {
           idx += i*element_count;
         }
-        if (!mask[idx]) {
+        if (mask[idx]) {
           dst[i*element_count+it*WARP_SIZE] = 0;
           continue;
         }
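
The last hunk above writes 0 directly into dst for positions whose mask is True. A quick illustration of what that means for the output, using public ops rather than the warp kernel itself (assuming at least one unmasked element per row):

import torch

x = torch.randn(2, 5)
mask = torch.tensor([[False, True, False, True, False],
                     [True, False, False, False, True]])  # True = masked out

out = torch.softmax(x.masked_fill(mask, float("-inf")), dim=-1)
assert bool((out[mask] == 0).all())  # masked positions come out as exact zeros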

aten/src/ATen/native/cuda/SoftMax.cu

Lines changed: 1 addition & 2 deletions
@@ -959,8 +959,7 @@ Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) {
       input.scalar_type(),
       "masked_softmax",
       [&] {
-        Tensor mask_not = mask.logical_not();
-        output = at::softmax(input.masked_fill(mask_not, -std::numeric_limits<scalar_t>::infinity()), -1);
+        output = at::softmax(input.masked_fill(mask, -std::numeric_limits<scalar_t>::infinity()), -1);
       });
   return output;
 }
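
Dropping the logical_not here is equivalent to the old fallback provided callers also flip the mask they pass in, which is exactly what the rest of this commit does. A small sketch of that equivalence with public ops (shapes and masking probability are arbitrary):

import torch

x = torch.randn(2, 3, 8, 8)
new_mask = torch.rand(2, 3, 8, 8) < 0.3   # new convention: True = masked out
new_mask[..., 0] = False                  # keep at least one position per row
old_mask = ~new_mask                      # old convention: True = keep

# Old fallback: invert the "keep" mask, then fill with -inf.
old_way = torch.softmax(x.masked_fill(old_mask.logical_not(), float("-inf")), dim=-1)
# New fallback: fill with -inf wherever the mask is True.
new_way = torch.softmax(x.masked_fill(new_mask, float("-inf")), dim=-1)
torch.testing.assert_close(old_way, new_way)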

test/test_nn.py

Lines changed: 2 additions & 0 deletions
@@ -16155,6 +16155,7 @@ def test_masked_softmax(self, device):
         mask = mask.cuda()
         mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool()
         native_res = torch._masked_softmax(input, mask)
+        mask = ~mask
         mask = mask.float()

         def slow_masked_softmax(input, mask):
@@ -16178,6 +16179,7 @@ def test_masked_softmax_transformer_layout(self, device):
         mask = mask.bool()
         native_res = torch._masked_softmax(input, mask)
         mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L)
+        mask = ~mask
         mask = mask.float()

         def slow_masked_softmax(input, mask):
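
The added mask = ~mask keeps the test's Python reference in sync: the native op now expects True to mean "masked out", while slow_masked_softmax (whose body is not shown in this hunk) presumably multiplies by a float mask where 1.0 means "keep". A hedged sketch of that relationship, with an assumed reference implementation:

import torch

def reference_softmax(scores, keep):
    # Assumed shape of the test's slow reference: a float mask, 1.0 = keep.
    exp = torch.exp(scores) * keep
    return exp / exp.sum(dim=-1, keepdim=True)

scores = torch.randn(1, 1, 2, 4)
drop = torch.tensor([[[[False, True, False, False],
                       [True, False, False, True]]]])  # native convention: True = masked out
keep = (~drop).float()                                 # reference convention: 1.0 = keep

torch.testing.assert_close(
    reference_softmax(scores, keep),
    torch.softmax(scores.masked_fill(drop, float("-inf")), dim=-1),
)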

0 commit comments
