diff --git a/CMakeLists.txt b/CMakeLists.txt
index 735844f2..17d5295f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,10 +37,11 @@ function(get_linux_lsb_release_information)
     set(LSB_RELEASE_VERSION "${LSB_RELEASE_VERSION}" PARENT_SCOPE)
 endfunction()
 
+set(OV_VERSION_SHORT "2024.4")
 set(OV_VERSION "2024.4.0.16579.c3152d32c9c_x86_64")
 set(OV_STORAGE_URL "https://storage.openvinotoolkit.org/repositories/openvino/packages")
-set(OV_NIGHTLY_COMMIT "2024.3.0-15502-66093834e38")
+set(OV_NIGHTLY_COMMIT "2024.4.0-16039-620d2a20c8c")
 
 if (WIN32)
     if(NOT OV_LIBRARY_URL)
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 1db5687b..40c28dcd 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -19,8 +19,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
 
 FetchContent_Declare(
     intel_npu_acceleration_library
-    GIT_REPOSITORY "https://github.com/intel/intel-npu-acceleration-library"
-    GIT_TAG "main"
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
 
 FetchContent_MakeAvailable(intel_npu_acceleration_library)
diff --git a/examples/cpp/main.cpp b/examples/cpp/main.cpp
index 2946105a..ab13f628 100644
--- a/examples/cpp/main.cpp
+++ b/examples/cpp/main.cpp
@@ -9,7 +9,7 @@ using namespace intel_npu_acceleration_library;
 #include <iostream>
 
 int main() {
-    const size_t batch = 128, inC = 256, outC = 512, N = 100000;
+    const size_t batch = 128, inC = 256, outC = 512, N = 10000;
 
     std::cout << "Create a ModelFactory" << std::endl;
     auto factory = std::make_shared<ModelFactory>("NPU");
@@ -28,19 +28,19 @@ int main() {
     factory->compile();
 
     // Save OV model
-    std::cout << "Saving model to matmul.xml" << std::endl;
-    factory->saveModel("matmul.xml");
+    // std::cout << "Saving model to matmul.xml" << std::endl;
+    // factory->saveModel("matmul.xml");
 
-    // Here you can create float16 buffers and run inference by using
-    half_ptr input_buffer = new uint16_t[batch * inC];
-    half_ptr weights_buffer = new uint16_t[outC * inC];
-    half_ptr bias_buffer = new uint16_t[outC];
-    half_ptr output_buffer = new uint16_t[batch * outC];
+    std::cout << "Creating a remote tensor" << std::endl;
+    auto input_buffer = factory->createRemoteInputTensor(0);
+    auto weights_buffer = factory->createRemoteInputTensor(1);
+    auto bias_buffer = factory->createRemoteInputTensor(2);
+    auto output_buffer = factory->createRemoteOutputTensor(0);
 
-    memset(input_buffer, 0, batch * inC * sizeof(uint16_t));
-    memset(weights_buffer, 0, outC * inC * sizeof(uint16_t));
-    memset(output_buffer, 0, batch * outC * sizeof(uint16_t));
-    memset(bias_buffer, 0, outC * sizeof(uint16_t));
+    std::memset(input_buffer.get(), 0, input_buffer.get_byte_size());
+    std::memset(weights_buffer.get(), 0, weights_buffer.get_byte_size());
+    std::memset(bias_buffer.get(), 0, bias_buffer.get_byte_size());
+    std::memset(output_buffer.get(), 0, output_buffer.get_byte_size());
 
     factory->setInputTensor(input_buffer, 0);
     factory->setInputTensor(weights_buffer, 1);
@@ -49,13 +49,10 @@ int main() {
     // Run inference
     std::cout << "Run inference on " << N << " workloads" << std::endl;
-    for (auto idx = 0; idx < N; idx++)
+    for (auto idx = 0; idx < N; idx++) {
         factory->run();
-    std::cout << "Inference done" << std::endl;
+    }
 
-    delete[] input_buffer;
-    delete[] weights_buffer;
-    delete[] bias_buffer;
-    delete[] output_buffer;
+    std::cout << "Inference done" << std::endl;
 
     return 0;
 }
\ No newline at end of file
diff --git a/include/intel_npu_acceleration_library/common.h b/include/intel_npu_acceleration_library/common.h
index cba90b8f..22ce8374 100644
--- a/include/intel_npu_acceleration_library/common.h
+++ b/include/intel_npu_acceleration_library/common.h
@@ -13,6 +13,7 @@
 #include "openvino/opsets/opset7.hpp"
 #include "openvino/opsets/opset8.hpp"
 #include "openvino/opsets/opset9.hpp"
+#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp"
 #include "openvino/runtime/intel_npu/properties.hpp"
 
 #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
@@ -23,6 +24,12 @@
 
 namespace intel_npu_acceleration_library {
 
+/**
+ * @brief OpenVINO core object
+ *
+ */
+ov::Core core;
+
 static constexpr ov::Property<std::string> npu_compiler_type{"NPU_COMPILER_TYPE"};
 static constexpr ov::Property<std::string> npu_parameters{"NPU_COMPILATION_MODE_PARAMS"};
diff --git a/include/intel_npu_acceleration_library/inference.h b/include/intel_npu_acceleration_library/inference.h
index 92321657..15331b72 100644
--- a/include/intel_npu_acceleration_library/inference.h
+++ b/include/intel_npu_acceleration_library/inference.h
@@ -19,15 +19,10 @@
 #include
 #include "intel_npu_acceleration_library/common.h"
 #include "intel_npu_acceleration_library/parameters.h"
+#include "intel_npu_acceleration_library/tensor.h"
 
 namespace intel_npu_acceleration_library {
 
-/**
- * @brief OpenVINO core object
- *
- */
-static ov::Core core;
-
 /**
  * @brief Create a remote tensor
  *
@@ -95,8 +90,6 @@
         compiled_model = core.compile_model(model, device);
         // Create inference request
         infer_request = compiled_model.create_infer_request();
-        // First inference
-        infer_request.infer();
     }
 
@@ -126,6 +119,14 @@
         wt_thread.join();
     }
 
+    /**
+     * @brief Get the remote context
+     *
+     */
+    auto get_context() {
+        return core.get_default_context(device).as<ov::intel_npu::level_zero::ZeroContext>();
+    }
+
     /**
      * @brief Save the model to a local path
      *
@@ -167,6 +168,42 @@
         }
     }
 
+    /**
+     * @brief Create a Remote Tensor object
+     *
+     * @param type element type
+     * @param shape element shape
+     * @param tensor_type element tensor type: INPUT, OUTPUT, BIND
+     * @return auto
+     */
+    auto createRemoteTensor(const ov::element::Type type, const ov::Shape& shape,
+                            const ov::intel_npu::TensorType tensor_type) {
+        ov::intel_npu::level_zero::ZeroContext context = get_context();
+        return context.create_l0_host_tensor(type, shape, tensor_type);
+    }
+
+    /**
+     * @brief Create a Remote Tensor object
+     *
+     * @param idx index of the input tensor
+     * @return auto
+     */
+    auto createRemoteInputTensor(size_t idx) {
+        auto tensor = infer_request.get_input_tensor(idx);
+        return createRemoteTensor(tensor.get_element_type(), tensor.get_shape(), ov::intel_npu::TensorType::INPUT);
+    }
+
+    /**
+     * @brief Create a Remote Tensor object
+     *
+     * @param idx index of the output tensor
+     * @return auto
+     */
+    auto createRemoteOutputTensor(size_t idx) {
+        auto tensor = infer_request.get_output_tensor(idx);
+        return createRemoteTensor(tensor.get_element_type(), tensor.get_shape(), ov::intel_npu::TensorType::OUTPUT);
+    }
+
     /**
      * @brief Get model input tensor
      *
@@ -201,6 +238,16 @@
         infer_request.set_input_tensor(idx, X);
     }
 
+    /**
+     * @brief Set the input activations
+     *
+     * @param _X reference to a zero buffer tensor
+     * @param idx input tensor index
+     */
+    void setInputTensor(ov::intel_npu::level_zero::ZeroBufferTensor& _X, size_t idx) {
+        infer_request.set_input_tensor(idx, _X);
+    }
+
     /**
      * @brief Set the output activations
      *
@@ -213,6 +260,16 @@
         infer_request.set_output_tensor(idx, X);
     }
 
+    /**
+     * @brief Set the output activations
+     *
+     * @param _X reference to a zero buffer tensor
+     * @param idx output tensor index
+     */
+    void setOutputTensor(ov::intel_npu::level_zero::ZeroBufferTensor& _X, size_t idx) {
+        infer_request.set_output_tensor(idx, _X);
+    }
+
     /**
      * @brief Set the input and output activations
      *
diff --git a/include/intel_npu_acceleration_library/tensor.h b/include/intel_npu_acceleration_library/tensor.h
new file mode 100644
index 00000000..70f5523a
--- /dev/null
+++ b/include/intel_npu_acceleration_library/tensor.h
@@ -0,0 +1,52 @@
+//
+// Copyright © 2024 Intel Corporation
+// SPDX-License-Identifier: Apache 2.0
+//
+
+#include "intel_npu_acceleration_library/common.h"
+
+namespace intel_npu_acceleration_library {
+
+/**
+ * @brief Class representing a NPU tensor
+ *
+ */
+class Tensor {
+private:
+    ov::intel_npu::level_zero::ZeroBufferTensor _remote_tensor;
+    void* data_ptr;
+
+public:
+    /**
+     * @brief Construct a new Tensor object
+     *
+     * @param dtype tensor datatype
+     * @param shape tensor shape
+     * @param data pointer to tensor data
+     * @param tensor_type tensor type. Choices between INPUT, OUTPUT, BINDED
+     * @param device target device for the tensor
+     */
+    Tensor(ov::element::Type_t dtype, ov::Shape shape, void* data,
+           ov::intel_npu::TensorType tensor_type = ov::intel_npu::TensorType::INPUT, std::string device = "NPU") {
+        if (!_isNPUAvailable(core)) {
+            // Cannot create NPU remote tensor... use the same pointer as before
+            data_ptr = data;
+        } else {
+            auto context = core.get_default_context(device).as<ov::intel_npu::level_zero::ZeroContext>();
+            _remote_tensor = context.create_l0_host_tensor(dtype, shape, tensor_type);
+            data_ptr = _remote_tensor.get();
+            std::memcpy(data_ptr, data, _remote_tensor.get_byte_size());
+        }
+    }
+
+    /**
+     * @brief Get the data pointer
+     *
+     * @return void*
+     */
+    void* data() {
+        return data_ptr;
+    }
+};
+
+}  // namespace intel_npu_acceleration_library
\ No newline at end of file
diff --git a/intel_npu_acceleration_library/backend/bindings.py b/intel_npu_acceleration_library/backend/bindings.py
index 9e17fa9a..587700bc 100644
--- a/intel_npu_acceleration_library/backend/bindings.py
+++ b/intel_npu_acceleration_library/backend/bindings.py
@@ -88,6 +88,15 @@ def init_common(lib: ctypes.CDLL):
 
     lib.compressToI4.argtypes = [c_i8_array, c_u8_array, ctypes.c_int]
 
+    # Remote tensors
+    lib.to_npu.argtypes = [ctypes.c_int, c_u32_array, ctypes.c_char_p, ctypes.c_void_p]
+    lib.to_npu.restype = handler
+
+    lib.remote_tensor_data.argtypes = [handler]
+    lib.remote_tensor_data.restype = ctypes.c_void_p
+
+    lib.del_remote_tensor.argtypes = [handler]
+
 
 def init_network_factory(lib: ctypes.CDLL):
     """Initialize Netowrk factory bindings.
diff --git a/intel_npu_acceleration_library/backend/factory.py b/intel_npu_acceleration_library/backend/factory.py
index 48108dff..db0e1d11 100644
--- a/intel_npu_acceleration_library/backend/factory.py
+++ b/intel_npu_acceleration_library/backend/factory.py
@@ -7,7 +7,7 @@
 from intel_npu_acceleration_library.backend.ops import get_supported_ops
 from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
 from intel_npu_acceleration_library.backend.tensor import Tensor
-from intel_npu_acceleration_library.dtypes import int4, bfloat16
+from intel_npu_acceleration_library.dtypes import get_backend_dtype
 from typing import Optional, Tuple, Any, Union, Sequence, TypeVar, Callable, cast, List
 from functools import partial
 import numpy.typing as npt
@@ -115,34 +115,10 @@ def get_backend_dtype(self, dtype) -> ctypes.c_char_p:
         Args:
             dtype: numpy dtype
 
-        Raises:
-            RuntimeError: Unsupported datatype
-
         Returns:
             ctypes.c_char_p: string representation of the dtype
         """
-        if dtype in [np.int8, torch.int8]:
-            str_dtype = "int8"
-        elif dtype == np.uint8 or dtype == int4:
-            # u8 represents packed i4 dtypes
-            str_dtype = "int4"
-        elif dtype in [np.int16, torch.int16]:
-            str_dtype = "int16"
-        elif dtype in [np.int32, torch.int32]:
-            str_dtype = "int32"
-        elif dtype in [np.int64, torch.int64]:
-            str_dtype = "int64"
-        elif dtype in [np.float16, torch.float16]:
-            str_dtype = "float16"
-        elif dtype in [np.float32, torch.float32]:
-            str_dtype = "float32"
-        elif dtype in [np.float64, torch.float64]:
-            str_dtype = "float64"
-        elif dtype in [bfloat16, torch.bfloat16]:
-            str_dtype = "bfloat16"
-        else:
-            raise RuntimeError(f"DType is not supported {dtype}")
-        return ctypes.c_char_p(str_dtype.encode())
+        return get_backend_dtype(dtype)
 
     @return_tensor
     def parameter(
diff --git a/intel_npu_acceleration_library/backend/tensor.py b/intel_npu_acceleration_library/backend/tensor.py
index e8cca7fc..08ced138 100644
--- a/intel_npu_acceleration_library/backend/tensor.py
+++ b/intel_npu_acceleration_library/backend/tensor.py
@@ -16,14 +16,90 @@
     int32,
     int64,
     NPUDtype,
+    get_backend_dtype,
 )
 from dataclasses import dataclass
 import functools
+from math import prod
 import numpy as np
 import ctypes
 import torch
 
 
+class RemoteTensor(torch.Tensor):
+    """
+    Represent a remote tensor object.
+
+    Attrs:
+        _remote_tensor (ctypes._Pointer): The pointer to the underlying remote tensor.
+
+    Methods:
+        from_torch(x: torch.Tensor): Create a remote tensor from a torch tensor.
+    """
+
+    _remote_tensor = None
+
+    @staticmethod
+    def __new__(cls, x: Any, remote_tensor: ctypes._Pointer, *args: Any, **kwargs: Any):
+        """
+        Create a new remote tensor object.
+
+        Args:
+            x (Any): tensor input
+            remote_tensor (ctypes._Pointer): remote tensor pointer
+            args (Any): additional arguments
+            kwargs (Any): additional keyword arguments
+
+        Returns:
+            RemoteTensor: a RemoteTensor object
+        """
+        return super().__new__(cls, x, *args, **kwargs)
+
+    def __init__(self, x: Any, remote_tensor: ctypes._Pointer):
+        """
+        Initialize the remote tensor object.
+
+        Args:
+            x (Any): tensor input
+            remote_tensor (ctypes._Pointer): remote tensor pointer
+        """
+        self._remote_tensor = remote_tensor
+
+    # def __del__(self):
+    #     if self._remote_tensor and backend_lib:
+    #         backend_lib.del_remote_tensor(self._remote_tensor)
+
+    @staticmethod
+    def from_torch(x: torch.Tensor) -> "RemoteTensor":
+        """
+        Create a remote tensor from a torch tensor.
+
+        Args:
+            x (torch.Tensor): The torch tensor.
+
+        Returns:
+            RemoteTensor: The remote tensor.
+ """ + shape_arr = np.array(x.shape, dtype=np.uint32) + dtype_str = get_backend_dtype(x.dtype) + p = ctypes.cast(x.data_ptr(), ctypes.c_void_p) + + rt = backend_lib.to_npu(shape_arr.size, shape_arr, dtype_str, p) + + pointer = ctypes.cast( + backend_lib.remote_tensor_data(rt), + ctypes.POINTER(ctypes.c_uint8), + ) + + arr = (pointer._type_ * prod(x.shape) * x.element_size()).from_address( + ctypes.addressof(pointer.contents) + ) + + pt_tensor = torch.frombuffer(arr, dtype=x.dtype).view(*x.shape) + + return RemoteTensor(pt_tensor, rt) + + @dataclass class Tensor: """ diff --git a/intel_npu_acceleration_library/device.py b/intel_npu_acceleration_library/device.py index f4a934a7..fc8e6b15 100644 --- a/intel_npu_acceleration_library/device.py +++ b/intel_npu_acceleration_library/device.py @@ -4,6 +4,7 @@ # from intel_npu_acceleration_library.nn.module import convert_to_npu_module +from intel_npu_acceleration_library.backend.tensor import RemoteTensor from torch.overrides import TorchFunctionMode from functools import lru_cache from typing import Any, MutableMapping @@ -165,8 +166,7 @@ def to(super_fn: Any, self: Any, *args: Any, **kwargs: Any): """ npu_device, args, kwargs = parse_to_arguments(*args, **kwargs) if npu_device: - # None for now, once the remote tensor feature lands, it can be converted to a remote tensor - pass + return super_fn(RemoteTensor.from_torch(self), *args, **kwargs) return super_fn(self, *args, **kwargs) diff --git a/intel_npu_acceleration_library/dtypes.py b/intel_npu_acceleration_library/dtypes.py index 8754e2f7..e996809c 100644 --- a/intel_npu_acceleration_library/dtypes.py +++ b/intel_npu_acceleration_library/dtypes.py @@ -7,6 +7,7 @@ from typing import Union import numpy as np import torch +import ctypes @dataclass(frozen=True) @@ -81,6 +82,42 @@ def __repr__(self) -> str: return self.name +def get_backend_dtype(dtype) -> ctypes.c_char_p: + """Get the string representation of the dtype. + + Args: + dtype: numpy dtype + + Raises: + RuntimeError: Unsupported datatype + + Returns: + ctypes.c_char_p: string representation of the dtype + """ + if dtype in [np.int8, torch.int8]: + str_dtype = "int8" + elif dtype in [np.uint8, int4, torch.uint8]: + # u8 represents packed i4 dtypes + str_dtype = "int4" + elif dtype in [np.int16, torch.int16]: + str_dtype = "int16" + elif dtype in [np.int32, torch.int32]: + str_dtype = "int32" + elif dtype in [np.int64, torch.int64]: + str_dtype = "int64" + elif dtype in [np.float16, torch.float16]: + str_dtype = "float16" + elif dtype in [np.float32, torch.float32]: + str_dtype = "float32" + elif dtype in [np.float64, torch.float64]: + str_dtype = "float64" + elif dtype in [bfloat16, torch.bfloat16]: + str_dtype = "bfloat16" + else: + raise RuntimeError(f"DType is not supported {dtype}") + return ctypes.c_char_p(str_dtype.encode()) + + float16 = NPUDtype( "fp16", 16, diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index e861260d..d4a5ac3b 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -68,25 +68,6 @@ def compute_input_signature( return "_".join(signature) -def patch_parameters(module: torch.nn.Module, model: NNFactory, recurse: bool = False): - """Patch the parameters of a PyTorch module with constants. - - Args: - module (torch.nn.Module): The PyTorch module. - model (NNFactory): The NNFactory instance. - recurse (bool, optional): Recurse over all submodules. Defaults to False. 
- """ - elements = list(module.named_parameters(recurse=recurse)) - for name, param in elements: - del module._parameters[name] - setattr(module, name, model.constant(param.data.detach().numpy())) - - buffers = list(module.named_buffers(recurse=recurse)) - for name, param in buffers: - del module._buffers[name] - setattr(module, name, model.constant(param.data.detach().numpy())) - - def patch_modules(module: torch.nn.Module, model: NNFactory): """Patch the modules of a PyTorch module with constants. @@ -98,7 +79,6 @@ def patch_modules(module: torch.nn.Module, model: NNFactory): for _, module in modules: if isinstance(module, Module): module.npu_top_level_module = False - # patch_parameters(module, model) patch_modules(module, model) @@ -230,7 +210,6 @@ def create_kwargs_from_list( npu_kwargs = create_kwargs_from_list(kwargs) patch_modules(self, model) - # patch_parameters(self, model) _ = self.forward(*npu_args, **npu_kwargs) model.compile() diff --git a/src/bindings.cpp b/src/bindings.cpp index de17afc8..03853395 100644 --- a/src/bindings.cpp +++ b/src/bindings.cpp @@ -8,13 +8,30 @@ extern "C" { intel_npu_acceleration_library_DLL_API bool isNPUAvailable() { - ov::Core core; - return intel_npu_acceleration_library::_isNPUAvailable(core); + return intel_npu_acceleration_library::_isNPUAvailable(intel_npu_acceleration_library::core); } intel_npu_acceleration_library_DLL_API uint32_t getNPUDriverVersion() { - ov::Core core; - return intel_npu_acceleration_library::driver_version(core); + return intel_npu_acceleration_library::driver_version(intel_npu_acceleration_library::core); +} + +// ######################## Remote Tensors ######################## + +intel_npu_acceleration_library_DLL_API intel_npu_acceleration_library::Tensor* to_npu(size_t size, + unsigned int* shape_data, + char* dtype, void* data) { + ov::element::Type_t ov_dtype = intel_npu_acceleration_library::dtype_from_string(std::string(dtype)); + std::vector shape(shape_data, shape_data + size); + + return new intel_npu_acceleration_library::Tensor(ov_dtype, shape, data); +} + +intel_npu_acceleration_library_DLL_API void* remote_tensor_data(intel_npu_acceleration_library::Tensor* rt) { + return rt->data(); +} + +intel_npu_acceleration_library_DLL_API void del_remote_tensor(intel_npu_acceleration_library::Tensor* rt) { + delete rt; } // ######################## Compression ########################