8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
@@ -411,7 +411,7 @@ def __init__(
 
     def get_workspace(self, m_max: int, group_size: int):
         hidden_size = self.hidden_size
-        intermediate_size = self.intermediate_size
+        intermediate_size = self.intermediate_size_per_partition
         num_experts = self.expert_size_per_partition
 
         # create workspace
@@ -564,7 +564,7 @@ def forward_chunk(
         # grouped gemm 1
         h1 = set_strides(workspace["workspace_1"],
                          self.expert_size_per_partition, m_max,
-                         self.intermediate_size * 2)
+                         self.intermediate_size_per_partition * 2)
 
         deepgemm_fp8_group_blockwise_gemm(
             d=h1,
@@ ... @@ def forward_chunk(
         # activation and quantization
         act_input_fp8 = set_strides(workspace["workspace_0"],
                                     self.expert_size_per_partition, m_max,
-                                    self.intermediate_size)
+                                    self.intermediate_size_per_partition)
 
-        scale_k = fp8_utils.ceil_div(self.intermediate_size, 128)
+        scale_k = fp8_utils.ceil_div(self.intermediate_size_per_partition, 128)
         scale_k_padded = fp8_utils.align(scale_k, 4)
         act_input_sf = set_strides(workspace["workspace_sf"],
                                    self.expert_size_per_partition,
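The change above sizes every DeepGEMM workspace and stride with the per-partition intermediate size instead of the full intermediate size. A minimal sketch of why that matters under tensor parallelism follows; the helper name moe_workspace_elems, the tp_size argument, and the plain integer arithmetic are illustrative assumptions, not the actual TensorRT-LLM helpers.

import math

def moe_workspace_elems(num_local_experts: int, m_max: int,
                        intermediate_size: int, tp_size: int) -> dict:
    # Hypothetical sizing helper. Assumes the FFN weights are sharded across
    # tp_size ranks, so each rank only holds a slice of the intermediate dim.
    intermediate_size_per_partition = intermediate_size // tp_size

    # Gate/up projection output: 2x the per-partition width per token slot.
    workspace_1 = num_local_experts * m_max * intermediate_size_per_partition * 2
    # Activation fed into the down projection: 1x the per-partition width.
    workspace_0 = num_local_experts * m_max * intermediate_size_per_partition
    # FP8 blockwise scales: one scale per 128-wide block, padded to a multiple of 4.
    scale_k = math.ceil(intermediate_size_per_partition / 128)
    scale_k_padded = (scale_k + 3) // 4 * 4

    return {"workspace_0": workspace_0,
            "workspace_1": workspace_1,
            "scale_k_padded": scale_k_padded}

# Sizing these buffers with the full intermediate_size (the pre-fix behavior)
# would make every buffer and stride tp_size times larger than the sharded
# expert weights actually need.
print(moe_workspace_elems(num_local_experts=8, m_max=1024,
                          intermediate_size=14336, tp_size=4))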
8 changes: 8 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -583,7 +583,15 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
 
             # Add one dummy request with the maximum possible sequence length.
             # The sequence length is limited by both the max_seq_len and the number of available blocks.
+            # Also, the sequence length is limited by the max_position_embeddings.
             token_num = max(1, min(available_tokens, self.max_seq_len - 1))
+            model_config = self.model.model_config.pretrained_config
+            max_position_embeddings = getattr(model_config,
+                                              'max_position_embeddings',
+                                              None)
+            if max_position_embeddings is not None:
+                token_num = min(token_num,
+                                max_position_embeddings - draft_len)
             max_seq_len_request = kv_cache_manager.add_dummy_requests(
                 request_ids=[batch_size - 1],
                 token_nums=[token_num],
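The added lines cap the CUDA-graph warmup dummy request so its length never exceeds what the model's position embeddings can represent, leaving room for speculative draft tokens. A standalone sketch of the same clamping logic, with a stand-in config class instead of the real pretrained_config object:

def warmup_token_num(available_tokens: int, max_seq_len: int,
                     draft_len: int, pretrained_config) -> int:
    # Length is bounded by free KV-cache capacity and max_seq_len first...
    token_num = max(1, min(available_tokens, max_seq_len - 1))
    # ...and then by max_position_embeddings minus the draft length,
    # when the config declares that attribute.
    max_position_embeddings = getattr(pretrained_config,
                                      'max_position_embeddings', None)
    if max_position_embeddings is not None:
        token_num = min(token_num, max_position_embeddings - draft_len)
    return token_num

# Example: 4K position embeddings, ample KV blocks, 8K max_seq_len,
# and 3 speculative draft tokens -> the warmup request shrinks to 4093 tokens.
class _Cfg:
    max_position_embeddings = 4096

assert warmup_token_num(available_tokens=100_000, max_seq_len=8192,
                        draft_len=3, pretrained_config=_Cfg()) == 4093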