Commit 25f4aaa

Update on "[ONNX] Update typing and error messages in symbolic_helper"
### Description

- Clearer error messages with more context
- Created `SymbolicValueError` which adds context of the value to the error message
- Type annotation

Example error message:

```
torch.onnx.errors.SymbolicValueError: ONNX symbolic does not understand the Constant node '%1 : Long(2, strides=[1], device=cpu) = onnx::Constant[value= 3 3 [ CPULongType{2} ]]()' specified with descriptor 'is'.  [Caused by the value '1 defined in (%1 : Long(2, strides=[1], device=cpu) = onnx::Constant[value= 3 3 [ CPULongType{2} ]]())' (type 'Tensor') in the TorchScript graph. The containing node has kind 'onnx::Constant'.]

    Inputs:
        Empty
    Outputs:
        #0: 1 defined in (%1 : Long(2, strides=[1], device=cpu) = onnx::Constant[value= 3 3 [ CPULongType{2} ]]()) (type 'Tensor')
```

### Issue

- #77316 (Runtime error during symbolic conversion)

### Testing

Unit tested

[ghstack-poisoned]
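As a minimal sketch of how the new error type surfaces to users (the model below is a stand-in that actually exports cleanly; it only shows where a `SymbolicValueError` would be caught):

```python
import torch

class Model(torch.nn.Module):
    def forward(self, x):
        return x + 1

try:
    torch.onnx.export(Model(), (torch.randn(2, 3),), "model.onnx")
except torch.onnx.errors.SymbolicValueError as err:
    # The message now embeds the offending TorchScript value, its type,
    # and the kind of the node that produced it, as in the example above.
    print(err)
```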
2 parents: 2fc64f6 + 8c90e15

File tree: 89 files changed, +8515 −1051 lines


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 8 deletions
```diff
@@ -1,8 +1 @@
-### Description
-<!-- What did you change and why was it needed? -->
-
-### Issue
-<!-- Link to Issue ticket or RFP -->
-
-### Testing
-<!-- How did you test your change? -->
+Fixes #ISSUE_NUMBER
```

.github/scale-config.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -65,5 +65,5 @@ runner_types:
   windows.8xlarge.nvidia.gpu:
     instance_type: p3.2xlarge
     os: windows
-    max_available: 50
+    max_available: 100
     disk_size: 256
```

.github/workflows/_ios-build-test.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -140,6 +140,7 @@ jobs:
           scripts/build_ios.sh
 
     - name: Run Build Test
+      timeout-minutes: 5
      run: |
        PROFILE=PyTorch_CI_2022
        # run the ruby build script
```

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -102,6 +102,7 @@ exclude_patterns = [
    'torch/distributed/elastic/agent/server/api.py',
    'torch/testing/_internal/**',
    'torch/distributed/fsdp/fully_sharded_data_parallel.py',
+    'torch/distributed/distributed_c10d.py',
    # TODO(suo): these exclusions were added just to get lint clean on master.
    # Follow up to do more target suppressions and remove them.
    'torch/distributed/fsdp/flatten_params_wrapper.py',
```

CMakeLists.txt

Lines changed: 42 additions & 51 deletions
```diff
@@ -43,7 +43,7 @@ set(CMAKE_C_STANDARD 11 CACHE STRING "The C standard whose features are reques
 if(DEFINED GLIBCXX_USE_CXX11_ABI)
   if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
     set(CXX_STANDARD_REQUIRED ON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+    string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=1")
   else()
     # Please note this is required in order to ensure compatibility between gcc 9 and gcc 7
     # This could be removed when all Linux PyTorch binary builds are compiled by the same toolchain again
@@ -799,22 +799,22 @@ if(NOT MSVC)
   # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
   string(APPEND CMAKE_CXX_FLAGS " -Wall")
   string(APPEND CMAKE_CXX_FLAGS " -Wextra")
-  string(APPEND CMAKE_CXX_FLAGS " -Werror=return-type")
+  append_cxx_flag_if_supported("-Werror=return-type" CMAKE_CXX_FLAGS)
   if(NOT USE_CUDNN)
     # Temporary fix to ignore non virtual dtor error if cudnn is used. A
     # separate PR to cudnn_frontend is needed to address this later on
-    string(APPEND CMAKE_CXX_FLAGS " -Werror=non-virtual-dtor")
+    append_cxx_flag_if_supported("-Werror=non-virtual-dtor" CMAKE_CXX_FLAGS)
   endif()
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-field-initializers")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-type-limits")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-array-bounds")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-pragmas")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-parameter")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-function")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-result")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-overflow")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-aliasing")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-error=deprecated-declarations")
+  append_cxx_flag_if_supported("-Wno-missing-field-initializers" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-unused-function" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-unused-result" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-error=deprecated-declarations" CMAKE_CXX_FLAGS)
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     string(APPEND CMAKE_CXX_FLAGS " -Wno-range-loop-analysis")
     string(APPEND CMAKE_CXX_FLAGS " -Wno-pass-failed")
@@ -855,32 +855,31 @@ if(NOT MSVC)
     endif()
   endif()
 
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-error=pedantic")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-error=redundant-decls")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-error=old-style-cast")
+  append_cxx_flag_if_supported("-Wno-error=pedantic" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-error=redundant-decls" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-error=old-style-cast" CMAKE_CXX_FLAGS)
   # These flags are not available in GCC-4.8.5. Set only when using clang.
   # Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    string(APPEND CMAKE_CXX_FLAGS " -Wconstant-conversion")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-invalid-partial-specialization")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-typedef-redefinition")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-warning-option")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-private-field")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-inconsistent-missing-override")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-aligned-allocation-unavailable")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-c++14-extensions")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-constexpr-not-const")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-braces")
-    string(APPEND CMAKE_CXX_FLAGS " -Qunused-arguments")
+    append_cxx_flag_if_supported("-Wconstant-conversion" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-invalid-partial-specialization" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-typedef-redefinition" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-inconsistent-missing-override" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-aligned-allocation-unavailable" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-c++14-extensions" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Qunused-arguments" CMAKE_CXX_FLAGS)
     if(${COLORIZE_OUTPUT})
-      string(APPEND CMAKE_CXX_FLAGS " -fcolor-diagnostics")
     endif()
   endif()
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
-    if(${COLORIZE_OUTPUT})
-      string(APPEND CMAKE_CXX_FLAGS " -fdiagnostics-color=always")
-    endif()
+
+  if(${COLORIZE_OUTPUT})
+    append_cxx_flag_if_supported("-fcolor-diagnostics" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-fdiagnostics-color=always" CMAKE_CXX_FLAGS)
   endif()
+
   if((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0")))
     OR(CMAKE_COMPILER_IS_GNUCXX
     AND(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0 AND NOT APPLE)))
@@ -895,21 +894,15 @@ if(NOT MSVC)
     endif()
   endif(WERROR)
   if(NOT APPLE)
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-but-set-variable")
-    string(APPEND CMAKE_CXX_FLAGS " -Wno-maybe-uninitialized")
+    append_cxx_flag_if_supported("-Wno-unused-but-set-variable" CMAKE_CXX_FLAGS)
+    append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
   endif()
   string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
   string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
-  string(APPEND CMAKE_CXX_FLAGS " -fno-math-errno")
-  string(APPEND CMAKE_CXX_FLAGS " -fno-trapping-math")
-  check_cxx_compiler_flag("-Werror=format" HAS_WERROR_FORMAT)
-  if(HAS_WERROR_FORMAT)
-    string(APPEND CMAKE_CXX_FLAGS " -Werror=format")
-  endif()
-  check_cxx_compiler_flag("-Werror=cast-function-type" HAS_WERROR_CAST_FUNCTION_TYPE)
-  if(HAS_WERROR_CAST_FUNCTION_TYPE)
-    string(APPEND CMAKE_CXX_FLAGS " -Werror=cast-function-type")
-  endif()
+  append_cxx_flag_if_supported("-fno-math-errno" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Werror=cast-function-type" CMAKE_CXX_FLAGS)
   check_cxx_compiler_flag("-Werror=sign-compare" HAS_WERROR_SIGN_COMPARE)
   # This doesn't work globally so we use the test on specific
   # target_compile_options
@@ -971,19 +964,17 @@ if(APPLE)
     string(APPEND CMAKE_CXX_FLAGS " -DUSE_MPS -fno-objc-arc")
     string(APPEND CMAKE_SHARED_LINKER_FLAGS " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal")
   endif()
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-private-field")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-braces")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-c++14-extensions")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-constexpr-not-const")
+  append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-c++14-extensions" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
 endif()
 
 if(EMSCRIPTEN)
   string(APPEND CMAKE_CXX_FLAGS " -Wno-implicit-function-declaration -DEMSCRIPTEN -s DISABLE_EXCEPTION_CATCHING=0")
 endif()
 
-if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0.0)
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow")
-endif()
+append_cxx_flag_if_supported("-Wno-stringop-overflow" CMAKE_CXX_FLAGS)
 
 if(ANDROID AND (NOT ANDROID_DEBUG_SYMBOLS))
   if(CMAKE_COMPILER_IS_GNUCXX)
```

aten/src/ATen/cpu/vec/vec256/vec256_qint.h

Lines changed: 10 additions & 6 deletions
```diff
@@ -417,10 +417,12 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
     // This is needed because the compiler emits awful code for the default
     // constructor for moving the enum
     // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-copy"
+    C10_CLANG_DIAGNOSTIC_PUSH()
+    #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
+    C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
+    #endif
     Vectorized(const Vectorized<c10::qint8>& other) : Vectorizedqi(other.vals) { }
-#pragma clang diagnostic pop
+    C10_CLANG_DIAGNOSTIC_POP()
 
     void store(void* ptr, int count = size()) const {
       if (count != size()) {
@@ -580,10 +582,12 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
     }
 
     // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-copy"
+    C10_CLANG_DIAGNOSTIC_PUSH()
+    #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
+    C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
+    #endif
     Vectorized(const Vectorized<c10::quint8>& other) : Vectorizedqi(other.vals) { }
-#pragma clang diagnostic pop
+    C10_CLANG_DIAGNOSTIC_POP()
 
     void store(void* ptr, int count = size()) const {
       if (count != size()) {
```

aten/src/ATen/native/Onehot.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -23,14 +23,14 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) {
     }
 
     // non-empty tensor
-    if (self.device().type() != at::kCUDA) {
+    if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS) {
      //for cuda, rely on device assert thrown by scatter
      TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative.");
    }
    if (num_classes == -1) {
        num_classes = self.max().item().toLong() + 1;
    } else {
-        if (self.device().type() != at::kCUDA) {
+        if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS) {
            //rely on device asserts from scatter to avoid sync here
            TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes.");
        } else {
```
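For reference, the checks being gated above are the eager bounds checks on class values. A quick CPU illustration of what they enforce (on CUDA, and with this change MPS, the same mistakes instead trip the device-side assert inside scatter, avoiding a host-device sync):

```python
import torch
import torch.nn.functional as F

labels = torch.tensor([0, 2, 1])
print(F.one_hot(labels, num_classes=3))
# tensor([[1, 0, 0],
#         [0, 0, 1],
#         [0, 1, 0]])

# On CPU these two calls fail the eager checks shown in the diff above;
# on CUDA/MPS they surface as device-side scatter asserts instead.
# F.one_hot(torch.tensor([-1]), num_classes=3)  # "Class values must be non-negative."
# F.one_hot(torch.tensor([5]), num_classes=3)   # "Class values must be smaller than num_classes."
```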

aten/src/ATen/native/cpu/Loops.h

Lines changed: 0 additions & 5 deletions
```diff
@@ -36,11 +36,6 @@
 #include <ATen/native/TensorIteratorDynamicCasting.h>
 #include <ATen/cpu/vec/vec.h>
 
-#ifndef _MSC_VER
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-but-set-parameter"
-#endif
-
 namespace at { namespace native { inline namespace CPU_CAPABILITY {
 
 using namespace vec;
```

aten/src/ATen/native/cuda/PersistentSoftmax.cuh

Lines changed: 5 additions & 3 deletions
```diff
@@ -90,7 +90,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
   dst += idx_offset;
 
   if (is_transformer_mask) {
-    mask += (idx_offset / head_chunk_size) * stride + local_idx;
+    mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx;
   } else {
     mask += idx_offset;
   }
@@ -117,13 +117,14 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
   acc_t max_value[WARP_BATCH];
   #pragma unroll
   for (int i = 0; i < WARP_BATCH; ++i) {
+    int batch_element_count = (i >= local_batches) ? 0 : element_count;
     bool is_meaningful_max = false;
     max_value[i] = elements[i][0];
     #pragma unroll
     for (int it = 0; it < WARP_ITERATIONS; ++it) {
       if (is_masked) {
         int idx = it*WARP_SIZE;
-        if ((idx + local_idx) < element_count) {
+        if ((idx + local_idx) < batch_element_count) {
           if (!is_transformer_mask) {
             idx += i*element_count;
           }
@@ -147,6 +148,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
   acc_t sum[WARP_BATCH] { 0.0f };
   #pragma unroll
   for (int i = 0; i < WARP_BATCH; ++i) {
+    int batch_element_count = (i >= local_batches) ? 0 : element_count;
     #pragma unroll
     for (int it = 0; it < WARP_ITERATIONS; ++it) {
       if (!is_masked) {
@@ -158,7 +160,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
       }
     } else {
       int idx = it*WARP_SIZE;
-      bool valid = (idx + local_idx) < element_count;
+      bool valid = (idx + local_idx) < batch_element_count;
       if (!is_transformer_mask) {
         idx += i*element_count;
       }
```
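The substance of this kernel change is the new `batch_element_count` guard: a warp owns up to `WARP_BATCH` rows, and when fewer than `WARP_BATCH` rows are valid (`i >= local_batches`) the masked path previously still compared indices against the full `element_count`, reading past the batch. A small Python rendering of the guard with illustrative values (names mirror the kernel):

```python
# Illustrative stand-ins for the kernel's launch parameters.
WARP_BATCH, WARP_ITERATIONS, WARP_SIZE = 2, 4, 32
element_count = 100   # elements per row
local_batches = 1     # this warp only owns one valid row
local_idx = 0         # lane id within the warp

for i in range(WARP_BATCH):
    # New guard: rows past local_batches contribute no elements.
    batch_element_count = 0 if i >= local_batches else element_count
    reads = 0
    for it in range(WARP_ITERATIONS):
        idx = it * WARP_SIZE
        if idx + local_idx < batch_element_count:  # was: < element_count
            reads += 1
    print(f"row {i}: {reads} masked reads")
# row 0: 4 masked reads
# row 1: 0 masked reads  (previously 4, indexing past the batch)
```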

aten/src/ATen/native/mps/operations/BitwiseBinaryOps.mm

Lines changed: 2 additions & 2 deletions
```diff
@@ -204,9 +204,9 @@ void handle_tensor_scalar_binary_op(const at::Tensor& self, const at::Scalar& ot
     [commandEncoder setBuffer:outBuf offset:output.storage_offset()*output.itemsize() atIndex:1];
     [commandEncoder setBuffer:selfBuf offset:self.storage_offset()*self.itemsize() atIndex:2];
     [commandEncoder setBytes:&sval length:sizeof(sval) atIndex:3];
-    [commandEncoder dispatchThreadgroups:MTLSizeMake((length + 511) / 512, 1, 1)
-                    threadsPerThreadgroup:MTLSizeMake(512, 1, 1)];
+    dispatch1DJob(commandEncoder, cplState, length);
     [commandEncoder endEncoding];
+    stream->commit(true);
   });
 }
 
```

aten/src/ATen/native/mps/operations/Distributions.mm

Lines changed: 0 additions & 14 deletions
```diff
@@ -1,16 +1,8 @@
 // Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <ATen/native/UnaryOps.h>
-#include <ATen/Dispatch.h>
 #include <ATen/native/Distributions.h>
 #include <ATen/native/DistributionTemplates.h>
-#include <ATen/native/TensorIterator.h>
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <torch/library.h>
 
 namespace at {
 namespace native {
@@ -198,11 +190,6 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
 }
 
 Tensor& normal_mps_out(double mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
-  TORCH_CHECK(
-    std.min().ge(0).item<bool>(),
-    "normal expects all elements of std >= 0.0");
-
-
   Tensor mean_t = empty_mps(
     output.sizes(),
     output.scalar_type(),
@@ -218,7 +205,6 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
 
 Tensor& normal_mps_out(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
   TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex");
-  TORCH_CHECK(std.numel() == 0 || std.min().ge(0).item<bool>(), "normal expects all elements of std >= 0.0");
   // Check that mean and std have same number of elements
   TORCH_CHECK(mean.numel() == std.numel(), "normal_mps_out: mean and std must have same number of elements")
 
```

benchmarks/cpp/nvfuser/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
```diff
@@ -26,7 +26,9 @@ if(USE_CUDA)
 
   target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark)
   if(NOT MSVC)
-    target_compile_options(nvfuser_bench PRIVATE -Wno-unused-variable -Wno-deprecated-copy -Werror)
+    target_compile_options_if_supported(nvfuser_bench -Werror)
+    target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable)
+    target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy)
   endif()
 
 endif()
```

build_variables.bzl

Lines changed: 1 addition & 1 deletion
```diff
@@ -130,6 +130,7 @@ libtorch_profiler_sources = [
     "torch/csrc/autograd/profiler_kineto.cpp",
     "torch/csrc/profiler/api.cpp",
     "torch/csrc/profiler/collection.cpp",
+    "torch/csrc/profiler/execution_graph_observer.cpp",
     "torch/csrc/profiler/kineto_shim.cpp",
     "torch/csrc/profiler/nvtx_observer.cpp",
     "torch/csrc/profiler/kineto_client_interface.cpp",
@@ -917,7 +918,6 @@ libtorch_python_core_sources = [
     "torch/csrc/monitor/python_init.cpp",
     "torch/csrc/multiprocessing/init.cpp",
     "torch/csrc/onnx/init.cpp",
-    "torch/csrc/profiler/execution_graph_observer.cpp",
     "torch/csrc/serialization.cpp",
     "torch/csrc/tensor/python_tensor.cpp",
     "torch/csrc/utils/init.cpp",
```

cmake/Dependencies.cmake

Lines changed: 2 additions & 6 deletions
```diff
@@ -823,12 +823,8 @@ if(USE_FBGEMM)
   set_property(TARGET fbgemm PROPERTY POSITION_INDEPENDENT_CODE ON)
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 13.0.0)
     # See https://github.com/pytorch/pytorch/issues/74352
-    target_compile_options(asmjit PRIVATE -Wno-deprecated-copy)
-    if(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.1.6)
-        OR("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0.0))
-      # -Wno-unused-but-set-variable doesn't exist in Apple clang version 13.0.0 (clang-1300.0.29.30)
-      target_compile_options(asmjit PRIVATE -Wno-unused-but-set-variable)
-    endif()
+    target_compile_options_if_supported(asmjit -Wno-deprecated-copy)
+    target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable)
   endif()
 endif()
 
```
