extension/llm/runner/multimodal_decoder_runner.h (1 addition & 1 deletion)
@@ -48,7 +48,7 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner
         &start_pos, {1}, executorch::aten::ScalarType::Long);
     // run text model
     auto outputs_res = ET_UNWRAP(
-        module_->execute(kTextModelMethod, {start_pos_tensor, embeddings}));
+        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor}));

     ET_CHECK_MSG(
         outputs_res.size() == 1,
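The argument-order change above matters because the exported `text_model` method binds its inputs positionally: embeddings first, position tensor second. A caller can sanity-check that ordering from the method's metadata before dispatch. A minimal sketch, assuming the standard `Module::method_meta` / `MethodMeta::input_tensor_meta` APIs; the helper name `second_input_is_positions` is hypothetical:

```cpp
#include <executorch/extension/module/module.h>

using ::executorch::extension::Module;

// Hypothetical sanity check: the second input of the exported method should
// be the Long position tensor, so {embeddings, start_pos_tensor} is the
// correct positional order.
inline bool second_input_is_positions(Module& module, const char* method) {
  auto meta = module.method_meta(method);
  if (!meta.ok()) {
    return false;
  }
  auto info = meta->input_tensor_meta(1); // index 1 = second input
  return info.ok() &&
      info->scalar_type() == ::executorch::aten::ScalarType::Long;
}
```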
extension/llm/runner/multimodal_prefiller.cpp (8 additions & 12 deletions)
@@ -91,24 +91,20 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   }

   // 2. Run decoder model for prefill.
-  // `cache_position` goes from start_pos to start_pos + encoder_output.size(1).
-  // e.g. if start_pos = 2 and encoder_output.size(1) = 5,
-  // cache_position_tensor should be [2, 3, 4, 5, 6].
-
-  // Get expected shape of cache position tensor, which should be the second
-  // argument
-
   int64_t seq_len = encoder_output.toTensor().size(1);
   if (seq_len == 0) {
     ET_LOG(Error, "The encoder returned an empty output.");
     return ::executorch::runtime::Error::InvalidState;
   }
-  std::vector<int64_t> cache_positions(seq_len);
-  for (int64_t i = 0; i < seq_len; ++i) {
-    cache_positions[i] = start_pos + i;
-  }
-  auto cache_position_tensor = ::executorch::extension::from_blob(
-      cache_positions.data(),
-      {static_cast<int>(seq_len)},
-      executorch::aten::ScalarType::Long);
+  auto cache_position_tensor = ET_UNWRAP(
+      populate_start_pos_or_cache_position(module_, start_pos, seq_len));

   auto prefill_result = module_->execute(
-      kTextModelMethod, {cache_position_tensor, encoder_output});
+      kTextModelMethod, {encoder_output, cache_position_tensor});
   if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
     return prefill_result.error();
   }
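Both updated call sites lean on `ET_UNWRAP`, which early-returns the wrapped error from the enclosing function when a `Result` is not Ok, letting `prefill` stay linear instead of checking every step by hand. A minimal, self-contained sketch of the pattern; `checked_seq_len` and `last_cache_position` are hypothetical, not part of the runner:

```cpp
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>

using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

// Hypothetical fallible step, mirroring the empty-encoder-output guard
// in prefill() above.
Result<int64_t> checked_seq_len(int64_t raw) {
  if (raw <= 0) {
    return Error::InvalidState;
  }
  return raw;
}

Result<int64_t> last_cache_position(int64_t start_pos, int64_t raw_len) {
  // ET_UNWRAP propagates the error to this function's caller on failure,
  // and binds the unwrapped value on success.
  auto seq_len = ET_UNWRAP(checked_seq_len(raw_len));
  return start_pos + seq_len - 1;
}
```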
extension/llm/runner/text_decoder_runner.cpp (2 additions & 14 deletions)
@@ -53,20 +53,8 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   auto numel = sizes[0];
   std::vector<::executorch::aten::SizesType> sizes_vec = {numel};

-  TensorPtr start_pos_tensor;
-  if (numel > 1) {
-    // If we are here, model is exported with cache_positions, create a tensor
-    // with the same length as input_ids. Assuming the last dimension is the
-    // one with the variable token length, for example [1, S] or [1, 1, S]
-    sizes_vec[sizes_vec.size() - 1] = tokens->numel();
-    start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
-    torch::executor::native::arange_out_impl(
-        start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
-  } else {
-    // Assuming model is exported with input_pos, create a tensor with size 1
-    start_pos_tensor = from_blob(
-        &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
-  }
+  auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
+      module_, start_pos, tokens->numel()));

   std::vector<runtime::EValue> inputs;
   auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
extension/llm/runner/util.h (39 additions)
@@ -7,6 +7,9 @@
  */

 #pragma once
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/platform/compiler.h>
 #include <stdio.h>
 #include <time.h>
@@ -99,6 +102,42 @@ ET_EXPERIMENTAL size_t inline get_rss_bytes() {
   // when this changed.
   return 0;
 }
+
+// Returns a position tensor for the text decoder: either a size-1 tensor
+// holding `start_pos` (the model was exported with `input_pos` and populates
+// the cache positions itself), or a tensor of `seq_len` consecutive cache
+// positions starting at `start_pos` (the model was exported with
+// `cache_positions`).
+inline runtime::Result<TensorPtr> populate_start_pos_or_cache_position(
+    Module* module,
+    int64_t& start_pos,
+    int seq_len) {
+  // Get the expected shape of the position tensor, which should be the
+  // second argument of the text model method.
+  auto method_meta = ET_UNWRAP(module->method_meta(kTextModelMethod));
+  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+  auto second_input_sizes = second_input_info.sizes();
+  auto numel = second_input_sizes[0];
+
+  if (numel > 1) {
+    // `cache_position` goes from start_pos to start_pos + seq_len,
+    // e.g. if start_pos = 2 and seq_len = 5,
+    // cache_position_tensor should be [2, 3, 4, 5, 6].
+    std::vector<int64_t> cache_positions(seq_len);
+    for (int64_t i = 0; i < seq_len; ++i) {
+      cache_positions[i] = start_pos + i;
+    }
+    // Return an owning tensor: `cache_positions` is local to this function,
+    // so a from_blob() view over its storage would dangle after return.
+    return ::executorch::extension::make_tensor_ptr(
+        {static_cast<::executorch::aten::SizesType>(seq_len)},
+        std::move(cache_positions));
+  } else {
+    // Cache position is size 1; `start_pos` is owned by the caller and
+    // outlives the returned view.
+    return ::executorch::extension::from_blob(
+        &start_pos, {1}, executorch::aten::ScalarType::Long);
+  }
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
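Taken together, the refactor lets every runner obtain the right position tensor with one call. A hedged usage sketch, assuming the helper lands as defined above; the wrapper `run_text_model` is hypothetical and mirrors the two updated call sites:

```cpp
#include <executorch/extension/llm/runner/util.h>
#include <executorch/extension/module/module.h>

using ::executorch::extension::Module;
using ::executorch::extension::TensorPtr;
using ::executorch::extension::llm::populate_start_pos_or_cache_position;
using ::executorch::runtime::EValue;
using ::executorch::runtime::Result;

// Hypothetical wrapper: one decoder step starting at `start_pos` over
// `seq_len` new token embeddings.
Result<std::vector<EValue>> run_text_model(
    Module* module,
    int64_t start_pos,
    const TensorPtr& embeddings,
    int seq_len) {
  // Size-1 [start_pos] for input_pos-style exports, or
  // [start_pos, ..., start_pos + seq_len - 1] for cache_positions exports.
  auto pos_tensor = ET_UNWRAP(
      populate_start_pos_or_cache_position(module, start_pos, seq_len));
  // Note the argument order this PR fixes: embeddings first.
  return module->execute(
      ::executorch::extension::llm::kTextModelMethod,
      {embeddings, pos_tensor});
}
```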