Skip to content

Introducing NXP Neutron runtime #10563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
@@ -271,6 +271,36 @@ jobs:
exit 1
fi

nxp-build-test:
  name: nxp-build-test
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-arm-sdk
    submodules: 'recursive'
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # Build the Neutron backend delegate library.
      cmake -DEXECUTORCH_BUILD_NXP_NEUTRON=ON -Bcmake-out .
      cmake --build cmake-out --target executorch_delegate_neutron --config Release

      # Build check for the neutron backend library.
      # Quote the variable so the test does not break on word splitting.
      lib_neutron="cmake-out/backends/nxp/libexecutorch_delegate_neutron.a"
      if [ -f "${lib_neutron}" ]; then
        echo "Neutron backend library built."
      else
        echo "Neutron backend library not found!"
        exit 1
      fi

test-coreml-delegate:
name: test-coreml-delegate
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -485,6 +485,10 @@ if(EXECUTORCH_BUILD_CADENCE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence)
endif()

# NXP Neutron NPU backend delegate (see backends/nxp).
if(EXECUTORCH_BUILD_NXP_NEUTRON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/nxp)
endif()

if(EXECUTORCH_BUILD_COREML)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
endif()
18 changes: 18 additions & 0 deletions backends/nxp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2024 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Common include paths: the repository root and the bundled c10 headers.
set(
_common_include_directories
${CMAKE_CURRENT_SOURCE_DIR}/../../..
${CMAKE_CURRENT_SOURCE_DIR}/../../runtime/core/portable_type/c10
)
# Use the ExecuTorch-generated c10 macros instead of the upstream ones.
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)

set(_neutron_sources ${CMAKE_CURRENT_SOURCE_DIR}/runtime/NeutronBackend.cpp )

# Static library with the Neutron backend runtime (registered at load time).
add_library(executorch_delegate_neutron STATIC ${_neutron_sources})
target_include_directories(
executorch_delegate_neutron PUBLIC ${_common_include_directories}
)
413 changes: 413 additions & 0 deletions backends/nxp/runtime/NeutronBackend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,413 @@
/*
* Copyright 2024 NXP
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*
* Implementation of the backend for the NXP Neutron NPU.
*/

#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>

#include "NeutronDriver.h"
#include "NeutronErrors.h"

using namespace std;

namespace torch {
namespace executor {
namespace neutron {

// All the memory passed to the Neutron driver needs to be aligned to 16 bytes.
#define BUFFER_ALIGNMENT 16
#define ALIGN_SIZE(size) \
  ((size + BUFFER_ALIGNMENT - 1) & (~(BUFFER_ALIGNMENT - 1)))

/* Header schema:
  +----------------------------------+-----------------------------------+
  | Input TensorFormats length (1B)  | Output TensorFormats length (1B)  |
  +----------------------------------+-----------------------------------+
  | 1st input tensor format (1B)     | [nth* input tensor format (1B)]   |
  +----------------------------------+-----------------------------------+
  | 1st output tensor format (1B)    | [nth* output tensor format (1B)]  |
  +----------------------------------+-----------------------------------+
*/
#define ITEM_SIZE 1 // 1 Byte
#define INPUT_TENSOR_FORMAT_LEN_POS 0
#define OUTPUT_TENSOR_FORMAT_LEN_POS 1
// Start of the per-input format flags, right after the two length bytes.
#define INPUT_TENSOR_FORMAT_ARRAY_ADDR(base) (base + 2 * ITEM_SIZE)
// Start of the per-output format flags, right after the input flags.
#define OUTPUT_TENSOR_FORMAT_ARRAY_ADDR(base) \
  (base + 2 * ITEM_SIZE + base[INPUT_TENSOR_FORMAT_LEN_POS])
// The Neutron payload follows the header, padded to BUFFER_ALIGNMENT.
#define PAYLOAD_ADDR(base) \
  (base + \
   ALIGN_SIZE( \
       2 * ITEM_SIZE + base[INPUT_TENSOR_FORMAT_LEN_POS] + \
       base[OUTPUT_TENSOR_FORMAT_LEN_POS]))

// Aggregate neutron model handle and data structures into one.
// NOTE(review): this name shadows the driver-level `NeutronConfig` declared in
// NeutronDriver.h within this namespace — consider renaming to avoid
// confusion with `neutronSetConfig()`'s parameter type.
typedef struct {
  int numInputs = 0;
  int numOutputs = 0;
  // Scratch buffer size read from the microcode header.
  uint32_t scratchSize = 0;
  NeutronModelConfig mcfg;
  NeutronDataConfig dcfg;
  NeutronModelHandle nmh = NULL;
  // Per-tensor flags (one byte each): non-zero means the tensor must be
  // transposed between channel-first and channel-last at execution time.
  const uint8_t* inputTranspositionFlags;
  const uint8_t* outputTranspositionFlags;
} NeutronConfig;

// Applied on outputs.
// Transposes a channel-last (NHWC) `src` buffer into a channel-first (NCHW)
// `dest` buffer. N, C, H and W are the logical NCHW dimensions; `src` and
// `dest` must not overlap.
// (The scraped review-comment text that was embedded in the parameter list
// has been removed; it made the function uncompilable.)
template <typename T>
void transposeToChannelFirst(
    const T* src,
    T* dest,
    size_t N,
    size_t C,
    size_t H,
    size_t W) {
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      for (size_t h = 0; h < H; h++) {
        for (size_t w = 0; w < W; w++) {
          dest[n * C * H * W + c * H * W + h * W + w] =
              src[n * H * W * C + h * W * C + w * C + c];
        }
      }
    }
  }
}

// Applied on inputs.
// Transposes a channel-first (NCHW) `src` buffer into a channel-last (NHWC)
// `dest` buffer. N, C, H and W are the logical NCHW dimensions; `src` and
// `dest` must not overlap.
// (The scraped review-comment text that was embedded in the parameter list
// has been removed; it made the function uncompilable.)
template <typename T>
void transposeToChannelLast(
    const T* src,
    T* dest,
    size_t N,
    size_t C,
    size_t H,
    size_t W) {
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      for (size_t h = 0; h < H; h++) {
        for (size_t w = 0; w < W; w++) {
          dest[n * H * W * C + h * W * C + w * C + c] =
              src[n * C * H * W + c * H * W + h * W + w];
        }
      }
    }
  }
}

// Transpose src buffer in channel first format into dest buffer in channel
// last format. `sizes` are the src dimensions of the ExecuTorch tensor
// (NCHW order), `element_size` is the element width in bytes.
void transposeInput(
    const void* src,
    void* dest,
    const ArrayRef<exec_aten::SizesType>& sizes,
    size_t element_size) {
  const size_t rank = sizes.size();
  // Tensors with fewer than 3 dimensions have no channel axis to move.
  if (rank < 3) {
    return;
  }
  const size_t C = sizes[rank - 3];
  const size_t H = sizes[rank - 2];
  const size_t W = sizes[rank - 1];
  // Fold all leading dimensions into a single batch dimension.
  size_t N = 1;
  for (size_t i = 0; i + 3 < rank; i++) {
    N *= sizes[i];
  }
  // Dispatch on the element width; any other width is a silent no-op.
  if (element_size == 1) {
    transposeToChannelLast<uint8_t>(
        static_cast<const uint8_t*>(src),
        static_cast<uint8_t*>(dest),
        N,
        C,
        H,
        W);
  } else if (element_size == 2) {
    transposeToChannelLast<uint16_t>(
        static_cast<const uint16_t*>(src),
        static_cast<uint16_t*>(dest),
        N,
        C,
        H,
        W);
  } else if (element_size == 4) {
    transposeToChannelLast<uint32_t>(
        static_cast<const uint32_t*>(src),
        static_cast<uint32_t*>(dest),
        N,
        C,
        H,
        W);
  } else if (element_size == 8) {
    transposeToChannelLast<uint64_t>(
        static_cast<const uint64_t*>(src),
        static_cast<uint64_t*>(dest),
        N,
        C,
        H,
        W);
  }
}

// Transpose src buffer in channel last format into dest buffer in channel
// first format. `sizes` are the dest dimensions of the ExecuTorch tensor
// (NCHW order), `element_size` is the element width in bytes.
void transposeOutput(
    const void* src,
    void* dest,
    const ArrayRef<exec_aten::SizesType>& sizes,
    size_t element_size) {
  const size_t rank = sizes.size();
  // Tensors with fewer than 3 dimensions have no channel axis to move.
  if (rank < 3) {
    return;
  }
  const size_t C = sizes[rank - 3];
  const size_t H = sizes[rank - 2];
  const size_t W = sizes[rank - 1];
  // Fold all leading dimensions into a single batch dimension.
  size_t N = 1;
  for (size_t i = 0; i + 3 < rank; i++) {
    N *= sizes[i];
  }
  // Dispatch on the element width; any other width is a silent no-op.
  if (element_size == 1) {
    transposeToChannelFirst<uint8_t>(
        static_cast<const uint8_t*>(src),
        static_cast<uint8_t*>(dest),
        N,
        C,
        H,
        W);
  } else if (element_size == 2) {
    transposeToChannelFirst<uint16_t>(
        static_cast<const uint16_t*>(src),
        static_cast<uint16_t*>(dest),
        N,
        C,
        H,
        W);
  } else if (element_size == 4) {
    transposeToChannelFirst<uint32_t>(
        static_cast<const uint32_t*>(src),
        static_cast<uint32_t*>(dest),
        N,
        C,
        H,
        W);
  } else if (element_size == 8) {
    transposeToChannelFirst<uint64_t>(
        static_cast<const uint64_t*>(src),
        static_cast<uint64_t*>(dest),
        N,
        C,
        H,
        W);
  }
}

class NeutronBackend final : public PyTorchBackendInterface {
 public:
  NeutronBackend() {}

  ~NeutronBackend() = default;

  virtual bool is_available() const override {
    return true;
  }

  /// Parse the `processed` blob (transposition-flag header followed by the
  /// 16-byte-aligned Neutron payload), validate the embedded microcode and
  /// prepare a model handle with the Neutron driver.
  /// Returns a NeutronConfig allocated from the runtime allocator; it lives
  /// until destroy() is called.
  Result<DelegateHandle*> init(
      BackendInitContext& context,
      FreeableBuffer* processed,
      ArrayRef<CompileSpec> compile_specs) const override {
    MemoryAllocator* allocator = context.get_runtime_allocator();

    auto* cfg = allocator->allocateInstance<NeutronConfig>();

    // The following data is read from the "processed" data blob:
    //   cfg->numInputs, cfg->numOutputs,
    //   cfg->mcfg.microcode, cfg->mcfg.weights, cfg->mcfg.kernels
    const uint8_t* transpositionFlags =
        static_cast<const uint8_t*>(processed->data());
    int numInputs = transpositionFlags[INPUT_TENSOR_FORMAT_LEN_POS];
    int numOutputs = transpositionFlags[OUTPUT_TENSOR_FORMAT_LEN_POS];
    cfg->inputTranspositionFlags =
        INPUT_TENSOR_FORMAT_ARRAY_ADDR(transpositionFlags);
    cfg->outputTranspositionFlags =
        OUTPUT_TENSOR_FORMAT_ARRAY_ADDR(transpositionFlags);

    const uint32_t* buffer = static_cast<const uint32_t*>(
        static_cast<const void*> PAYLOAD_ADDR(transpositionFlags));
    uint32_t magicWord = buffer[0];
    // Check valid microcode ("nMCd" magic word, little-endian).
    if (magicWord != 0x64434D6E) {
      ET_LOG(
          Error,
          "Preprocessed buffer does not contain a valid Neutron microcode");
      return Error::InvalidProgram;
    }
    // Fixed 32-bit word offsets into the microcode header.
    uint32_t microcodeSize = buffer[6];
    uint32_t weightsSize = buffer[7];
    cfg->scratchSize = buffer[9];
    cfg->numInputs = buffer[11];
    cfg->numOutputs = buffer[12];
    // Cross-check the microcode header against the transposition-flag header.
    if (cfg->numInputs != numInputs) {
      ET_LOG(
          Error,
          "Preprocessed buffer does not contain a valid number of inputs");
      return Error::InvalidProgram;
    }
    if (cfg->numOutputs != numOutputs) {
      ET_LOG(
          Error,
          "Preprocessed buffer does not contain a valid number of outputs");
      return Error::InvalidProgram;
    }
    // Microcode, weights and kernels are laid out back to back, each section
    // padded to BUFFER_ALIGNMENT.
    cfg->mcfg.microcode =
        static_cast<const uint8_t*>(static_cast<const void*>(buffer));
    cfg->mcfg.weights = static_cast<const uint8_t*>(cfg->mcfg.microcode) +
        ALIGN_SIZE(microcodeSize);
    cfg->mcfg.kernels = static_cast<const uint8_t*>(cfg->mcfg.weights) +
        ALIGN_SIZE(weightsSize);

#if (NO_HEAP_USAGE == 0)
    // The driver allocates and deallocates place for NeutronModelHandle.
    cfg->nmh = NULL;
#else
    // Allocate place for NeutronModelHandle from the runtime allocator.
    cfg->nmh = static_cast<NeutronModelHandle>(
        allocator->allocate(neutronGetModelContextSize()));
#endif

    // Prepare data for the neutron driver.
    NeutronError neutronRC =
        neutronModelPrepare((const NeutronModelConfig*)&cfg->mcfg, &cfg->nmh);
    if (neutronRC != ENONE) {
      // NeutronError is int32_t; widen explicitly to match %ld.
      ET_LOG(
          Error,
          "Neutron model preparation failed with error code %ld",
          static_cast<long>(neutronRC));
      return Error::InvalidProgram;
    }

    return cfg;
  }

  /// Bind input/output tensors from `args`, apply the channel-order
  /// transpositions requested by the blob header, run the model on the NPU
  /// and transpose flagged outputs back to channel-first.
  Error execute(
      BackendExecutionContext& context,
      DelegateHandle* input_handle,
      EValue** args) const override {
    NeutronConfig* cfg = static_cast<NeutronConfig*>(input_handle);

    // Allocate place for input and output pointers. One extra slot after the
    // outputs carries the scratch buffer, so numOutputs + 1 entries are
    // needed (the previous code allocated only numOutputs and wrote one
    // element out of bounds).
    cfg->dcfg.inputs = static_cast<const void**>(
        context.allocate(cfg->numInputs * sizeof(void*)));
    cfg->dcfg.outputs = static_cast<void**>(
        context.allocate((cfg->numOutputs + 1) * sizeof(void*)));
    // NOTE(review): the scratch buffer is conveyed in the slot after the last
    // output — confirm whether dcfg.scratch should be set instead (see
    // NeutronDataConfig in NeutronDriver.h).
    cfg->dcfg.outputs[cfg->numOutputs] =
        static_cast<void*>(context.allocate(cfg->scratchSize, 16));

    // Set inputs and outputs from args.
    for (int i = 0; i < cfg->numInputs; i++) {
      cfg->dcfg.inputs[i] = args[i]->toTensor().const_data_ptr();
    }
    for (int i = 0; i < cfg->numOutputs; i++) {
      cfg->dcfg.outputs[i] =
          args[cfg->numInputs + i]->toTensor().mutable_data_ptr();
    }

    // Transpose flagged inputs into temporary channel-last buffers.
    for (int i = 0; i < cfg->numInputs; i++) {
      if (cfg->inputTranspositionFlags[i]) {
        if (args[i]->toTensor().sizes().size() < 3) {
          ET_LOG(Error, "Unable to transpose 1D and 2D input to channel last");
          return Error::InvalidProgram;
        }
        // Allocate buffer, the allocator is reset after each PTE instruction.
        void* buffer = context.allocate(args[i]->toTensor().nbytes(), 16);
        transposeInput(
            args[i]->toTensor().const_data_ptr(),
            buffer,
            args[i]->toTensor().sizes(),
            args[i]->toTensor().element_size());
        cfg->dcfg.inputs[i] = buffer;
      }
    }
    // Redirect flagged outputs into temporary buffers so they can be
    // transposed back after the run.
    for (int i = 0; i < cfg->numOutputs; i++) {
      if (cfg->outputTranspositionFlags[i]) {
        // Allocate buffer, the allocator is reset after each PTE instruction.
        void* buffer =
            context.allocate(args[cfg->numInputs + i]->toTensor().nbytes(), 16);
        cfg->dcfg.outputs[i] = buffer;
      }
    }

#ifdef NEUTRON_PROFILE
    // TODO: Use trace from BackendExecutionContext.
    NeutronTraceConfig trace_config{.traceConfig = 0};
    neutronSetTrace(cfg->nmh, &trace_config);
#endif

    // Run neutron compute.
    NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg);
    if (neutronRC != ENONE) {
      // NeutronError is int32_t; widen explicitly to match %ld.
      ET_LOG(
          Error,
          "Neutron model evaluation failed with error code %ld",
          static_cast<long>(neutronRC));
      return Error::InvalidProgram;
    }

    // Transpose flagged outputs back to channel-first.
    for (int i = 0; i < cfg->numOutputs; i++) {
      if (cfg->outputTranspositionFlags[i]) {
        if (args[cfg->numInputs + i]->toTensor().sizes().size() < 3) {
          ET_LOG(
              Error, "Unable to transpose 1D and 2D output to channel first");
          return Error::InvalidProgram;
        }
        transposeOutput(
            cfg->dcfg.outputs[i],
            args[cfg->numInputs + i]->toTensor().mutable_data_ptr(),
            args[cfg->numInputs + i]->toTensor().sizes(),
            args[cfg->numInputs + i]->toTensor().element_size());
      }
    }

    return Error::Ok;
  }

  /// Release the driver-side model context. Memory obtained from the runtime
  /// allocators is reclaimed by the allocators themselves.
  void destroy(DelegateHandle* handle) const override {
    NeutronConfig* cfg = reinterpret_cast<NeutronConfig*>(handle);

    // Unprepare to free resources in neutron driver.
    NeutronError neutronRC = neutronModelUnprepare(cfg->nmh);
    (void)neutronRC;

    // Deallocation is done automatically by the runtime allocator, so no
    // explicit delete of cfg or the pointer tables here.
    return;
  }
};

namespace {
// Static registration: constructing `registered` at load time makes the
// "NeutronBackend" delegate discoverable by the ExecuTorch runtime.
auto backend = NeutronBackend();
Backend backend_id{"NeutronBackend", &backend};
static auto registered = register_backend(backend_id);
} // namespace

} // namespace neutron
} // namespace executor
} // namespace torch
252 changes: 252 additions & 0 deletions backends/nxp/runtime/NeutronDriver.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
/*
* Copyright 2022-2024 NXP
*
* SPDX-License-Identifier: BSD-3-Clause
*
* Interface for the NXP Neutron NPU driver.
*/

#ifndef NEUTRON_DRIVER_H
#define NEUTRON_DRIVER_H

#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "NeutronErrors.h"

/* Neutron Driver error category codes */
typedef enum ERROR_CATEGORY_DRIVER {
  ERROR_CATEGORY_DRIVER_GENERIC, /* Generic error category */
  ERROR_CATEGORY_DRIVER_UNSUPPORTED, /* Unsupported function */
  ERROR_CATEGORY_DRIVER_UCODE, /* Microcode bad magic or incompatible version */
  ERROR_CATEGORY_DRIVER_INVALID, /* Invalid arguments */
  ERROR_CATEGORY_DRIVER_BAD_HANDLE, /* Bad inference handle */
  ERROR_CATEGORY_DRIVER_NO_MEMORY, /* Not enough memory */
  ERROR_CATEGORY_DRIVER_INTERNAL_FAULT, /* Internal error */
  ERROR_CATEGORY_DRIVER_UNKNOWN_ARCH, /* Unknown architecture */
  ERROR_CATEGORY_DRIVER_TRACE_NOT_RUN, /* Tracing did not run, but a trace
                                          buffer was requested. */
  ERROR_CATEGORY_DRIVER_TIMEOUT /* Timeout error. */
} ERROR_CATEGORY_DRIVER;

/// Trace configuration to enable kernel level tracing.
#define TRACE_CONFIG_KERNEL_LEVEL (1U << 0)

/// Trace configuration to enable job level tracing.
#define TRACE_CONFIG_JOB_LEVEL (1U << 1)

// Macro deciding where memory for the Neutron model context is allocated:
// 0 = driver uses the heap, non-zero = caller supplies the memory.
#ifndef NO_HEAP_USAGE
#define NO_HEAP_USAGE 0
#endif

/* Neutron Driver errors: driver-component error codes built on top of the
   generic NeutronError encoding from NeutronErrors.h. */
#define GEN_NEUTRON_DRIVER_ERROR(category, code) \
  GEN_NEUTRON_ERROR(ERROR_COMPONENT_DRIVER, category, code)
#define GEN_NEUTRON_DRIVER_GENERIC_ERROR() \
  GEN_NEUTRON_DRIVER_ERROR(ERROR_CATEGORY_DRIVER_GENERIC, __LINE__)

/// Type definition for a Neutron model handle. This is an identifier used to
/// uniquely identify a model. The convention is that the value
/// NEUTRON_INVALID_HANDLE handle corresponds to an invalid handle.
typedef void* NeutronModelHandle;

typedef struct {
  /// Neutron microcode buffer address.
  /// The Neutron microcode is generated by the Neutron converter tool.
  /// The microcode buffer, 16 bytes aligned, is allocated and initialized by
  /// the application or ML framework. The microcode buffer is passed by
  /// reference to the Neutron firmware. The microcode buffer is specific for a
  /// given ML model.
  const void* microcode;

  /// Neutron weights buffer address.
  /// The Neutron weights is generated by the Neutron converter tool.
  /// The weights buffer, 16 bytes aligned, is allocated and initialized by the
  /// application or ML framework. The weights buffer address is passed by
  /// reference to the Neutron-firmware. The weights buffer is specific for a
  /// given ML model.
  const void* weights;

  /// Neutron kernels buffer address.
  /// The Neutron kernels are generated by the Neutron converter tool.
  /// The kernels buffer, 16 bytes aligned, is allocated and initialized by the
  /// application or ML framework. The kernels buffer address is passed by
  /// reference to the Neutron-firmware. The kernels buffer is specific for a
  /// given ML model.
  const void* kernels;

  /// Timeout for the microcode run, in seconds.
  /// This is the upper limit the user expects a run to take to complete;
  /// default 60.
  uint32_t timeoutSeconds;

} NeutronModelConfig;

typedef struct {
  /// The input buffers of the model.
  /// The input buffers are allocated and initialized by the application or ML
  /// framework. The input buffers are passed by reference to the Neutron
  /// firmware.
  const void** inputs;

  /// The output buffers of the model.
  /// The output buffers are allocated by the application or ML framework.
  /// The output buffers are passed by reference to the Neutron firmware.
  void** outputs;

  /// Scratch buffer required for computing model intermediate results.
  /// If NULL, this buffer has to be allocated by the driver.
  void* scratch;

  /// Scratch buffer required for prefetching model weights from FLASH to SRAM.
  /// This buffer is used only for Neutron-C targets when the weight prefetch
  /// option was explicitly used. If NULL, this buffer has to be allocated by
  /// the driver.
  void* scratchWeights;

} NeutronDataConfig;

typedef struct {
  /// Sets whether tracing should be executed during firmware run or not.
  /// If set to 0, tracing will not run.
  /// If set to 1 - kernel level tracing.
  /// If set to 2 - job level tracing.
  /// If set to 3 - mixed level tracing.
  uint32_t traceConfig;

  /// Buffer to store collected trace data.
  /// If it is NULLPTR, the driver will allocate the memory; otherwise the
  /// application provides it.
  char* traceBuffer;

  /// The allocated size of traceBuffer, used to check that appending a
  /// string will not go out of bounds. The application should set this if it
  /// allocated the buffer; otherwise the driver will set the value.
  size_t traceBufferSize;
} NeutronTraceConfig;

/// This structure contains the prototypes for functions that have a custom
/// implementation. Any new functions or variables must be added at the end.
typedef struct {
  /// This function performs the copying from FLASH to SRAM.
  void (*copy)(void* dst, void* src, uint32_t size, uint32_t channel);
  /// This is a blocking function that checks if the current copy has finished.
  void (*wait)(uint32_t channel);
} NeutronConfig;

/* Invalid handle, returned by neutronModelPrepare() if an error occurred. */
#define NEUTRON_INVALID_HANDLE NULL

/// - Initialize the Neutron Driver library: set initial values, do memory
///   allocation for internal data structures, do memory mapping.
/// - Expected to be called by the surrounding RTOS/application startup code
///   (similar to ethosu_init), not by the ExecuTorch backend itself.
NeutronError neutronInit();

/// - Deinitialize the Neutron Driver library, releasing any resources acquired
///   by neutronInit.
NeutronError neutronDeinit();

/// - Prepare Neutron execution for a model with custom firmware.
/// - This function is only available for Neutron-S.
NeutronError neutronCustomPrepare(
    uint32_t* inputSize,
    int32_t numInputs,
    uint32_t* outputSize,
    int32_t numOutputs,
    const void* firmware,
    size_t firmwareSize,
    NeutronModelHandle* hdl);

/// - Run Neutron custom firmware and get the results.
/// - This function is only available for Neutron-S.
NeutronError neutronCustomExec(
    NeutronModelHandle hdl,
    const NeutronDataConfig* neutron_dcfg);

/// - Prepare Neutron execution for a model with the given configuration.
/// - This function only prepares the execution by transferring the parameters
///   to the firmware.
/// - This function allows caching a model and then running the same model but
///   with different input data (assuming the new input data replaces the old
///   input data by reusing the same buffers).
/// - In case external allocated memory shall be used for the ModelHandle,
///   e.g. from the Tensorflow tensor arena, hdl shall be a pointer to the
///   start of the allocated memory block.
/// - If a pointer to NULL is passed, memory will be allocated by the driver
///   from HEAP. If no HEAP is available, an error will be thrown.
NeutronError neutronModelPrepare(
    const NeutronModelConfig* mcfg,
    NeutronModelHandle* hdl);

/// - Unprepare Neutron execution handle.
/// - This function releases the internal context data structures and the
///   reserved handle.
NeutronError neutronModelUnprepare(NeutronModelHandle hdl);

/// - Perform Neutron execution in blocking mode.
NeutronError neutronRunBlocking(
    NeutronModelHandle hdl,
    const NeutronDataConfig* dcfg);

/// - Perform Neutron execution in non-blocking mode.
/// - This functionality is only available for Neutron-S.
NeutronError neutronRunNonBlocking(
    NeutronModelHandle hdl,
    const NeutronDataConfig* dcfg);

/// - Wait (block) for Neutron completion.
/// - This functionality is only available for Neutron-S.
NeutronError neutronWait(NeutronModelHandle hdl, const NeutronDataConfig* dcfg);

/// - Query if the job is done by Neutron.
/// - This functionality is only available for neutronRunNonBlocking.
NeutronError neutronIsReady(NeutronModelHandle hdl, bool* isReady);

#ifndef NDEBUG
/// - Set tracing information.
void neutronSetTrace(NeutronModelHandle hdl, NeutronTraceConfig* tcfg);

/// - Get tracing result to buffer.
NeutronError
neutronGetTrace(NeutronModelHandle hdl, char** buffer, size_t* size);
#endif

/// - Perform power management to suspend Neutron hardware.
/// - This function disables the clock for Neutron.
NeutronError neutronSuspend();

/// - Perform power management to resume Neutron hardware.
/// - This function enables the clock for Neutron.
NeutronError neutronResume();

/// - Used to initialize custom API's or variables implemented by external
///   application.
NeutronError neutronSetConfig(NeutronConfig* config);

/// - Used to get NeutronContext size.
size_t neutronGetModelContextSize();

/// - Allocates size bytes and returns a pointer to the allocated memory.
///   The returned pointer address will be a multiple of the alignment.
///   Returns NULL on failure.
/// - alignment: Set to 0 if unsure of alignment requirements.
/// - This function is only available for Neutron-S in the Linux environment.
void* neutronMemAlloc(size_t alignment, size_t size);

/// - Frees the memory buffer pointed to by ptr.
/// - This function is only available for Neutron-S in the Linux environment.
void neutronMemFree(void* ptr);
/// Other functions to control the state of driver/firmware.
#ifdef __cplusplus
}
#endif
#endif // NEUTRON_DRIVER_H
48 changes: 48 additions & 0 deletions backends/nxp/runtime/NeutronErrors.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright 2022-2024 NXP
*
* SPDX-License-Identifier: BSD-3-Clause
*
* Definition of the NXP Neutron NPU driver errors.
*/

#ifndef NEUTRON_ERRORS_H
#define NEUTRON_ERRORS_H

#include <stdint.h>

typedef int32_t NeutronError;

/*
 Generate error code.
 A code is composed of (from least to most significant bit):
   3 bits = component id
   5 bits = category id
   23 bits = code
   1 bit = sign
 The masks below match this layout and the GET_ERROR_* accessors; the
 previous version masked both component and category with 0xF, which
 overlapped the component into the category field and truncated 5-bit
 categories to 4 bits.
*/
#define GEN_NEUTRON_ERROR(component, category, code)                    \
  ((NeutronError)(((component & 0x7) << 0) | ((category & 0x1F) << 3) | \
                  ((code & 0x7FFFFF) << 8)))

/* Success code: no error. */
#define ENONE 0

#define GET_ERROR_COMPONENT(e) ((e >> 0) & 0x00000007)
#define GET_ERROR_CATEGORY(e) ((e >> 3) & 0x0000001F)
#define GET_ERROR_CODE(e) ((e >> 8) & 0x007FFFFF)

/* Component ids */
// DO NOT USE 0x0 as component magic number!
typedef enum ERROR_COMPONENT_ID {
  ERROR_COMPONENT_LIBRARY = 0x1,
  ERROR_COMPONENT_FIRMWARE = 0x2,
  ERROR_COMPONENT_DRIVER = 0x3
} ERROR_COMPONENT_ID;

/// Retrieve component name as string from NeutronError code.
char* getNeutronErrorComponent(NeutronError ne);

/// Retrieve category as string from NeutronError code.
char* getNeutronErrorCategory(NeutronError ne);

#endif // NEUTRON_ERRORS_H
1 change: 0 additions & 1 deletion kernels/portable/cpu/op_expand_copy.cpp
Original file line number Diff line number Diff line change
@@ -10,7 +10,6 @@
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/kernels/portable/cpu/util/repeat_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <sys/types.h>

#include <cstring>