26 changes: 23 additions & 3 deletions python/tvm/relax/backend/contrib/cublas.py
@@ -25,7 +25,7 @@
from tvm.relax.transform import PatternCheckContext

from ..pattern_registry import get_patterns_with_prefix, register_patterns
from ..patterns import make_matmul_pattern
from ..patterns import make_matmul_pattern, make_matmul_dequantize_pattern
from ..utils import has_leaking_intermediate_variables


@@ -48,6 +48,16 @@ def _check_matmul(context: PatternCheckContext) -> bool:
rhs = context.annotated_expr["rhs"]
matmul_call = context.annotated_expr["root"]

if "scale" in context.annotated_expr and "zp" in context.annotated_expr:
scale = context.annotated_expr["scale"]
zero_point = context.annotated_expr["zp"]
# Only scalar values for scale and zero_point are supported.
if scale.struct_info.ndim != 0 or zero_point.struct_info.ndim != 0:
return False
# Only zero_point == 0.0 is supported.
if zero_point.data.numpy()[()].item() != 0.0:
return False

lhs_dtype = lhs.struct_info.dtype
rhs_dtype = rhs.struct_info.dtype
out_dtype = matmul_call.struct_info.dtype
@@ -187,11 +197,16 @@ def _check_matmul(context: PatternCheckContext) -> bool:
),
_check_matmul,
),
(
"cublas.matmul_transposed_dequantize",
*make_matmul_dequantize_pattern(transposed_rhs=True),
_check_matmul,
),
]
)


def partition_for_cublas(mod):
def partition_for_cublas(mod, bind_constants=False):
"""
Partition the input module into cuBLAS-supported subgraphs.

@@ -200,6 +215,9 @@ def partition_for_cublas(mod):
mod: tvm.IRModule
The IRModule to be partitioned.

bind_constants : bool
Whether or not to keep bound constants in the grouped function.

Returns
-------
mod: tvm.IRModule
@@ -208,4 +226,6 @@ def partition_for_cublas(mod):
"""

patterns = get_patterns_with_prefix("cublas")
return transform.FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
return transform.FuseOpsByPattern(
patterns, bind_constants=bind_constants, annotate_codegen=True
)(mod)
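For reference, a minimal usage sketch of the updated entry point; the wrapper name `offload_to_cublas` is illustrative and not part of this change, while `partition_for_cublas` and `RunCodegen` are used exactly as in the tests below.

# Illustrative sketch only: partition cuBLAS-supported subgraphs and run codegen,
# keeping constants bound so the codegen can read the dequantize scale constant.
from tvm import relax
from tvm.relax.backend.contrib.cublas import partition_for_cublas

def offload_to_cublas(mod, bind_constants=True):
    mod = partition_for_cublas(mod, bind_constants=bind_constants)
    mod = relax.transform.RunCodegen()(mod)
    return mod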
40 changes: 40 additions & 0 deletions python/tvm/relax/backend/patterns.py
@@ -336,6 +336,46 @@ def make_rms_norm_pattern():
return out, annotations


def make_matmul_dequantize_pattern(
transposed_rhs: bool = False,
) -> Tuple[DFPattern, Mapping[str, DFPattern]]:
"""
Create a pattern for a matrix multiplication followed by a dequantize operation.

Parameters
----------
transposed_rhs: bool
Whether the right-hand side of the multiplication is transposed.

Returns
-------
pattern: DFPattern
The resulting pattern describing a matrix multiplication followed by a dequantize operation.

annotations: Mapping[str, DFPattern]
A mapping from name to sub-pattern. It can be used to extract important expressions from
the match result, to power the partition check function and codegen.
"""

lhs = wildcard()
rhs = wildcard()
annotations = {"lhs": lhs, "rhs": rhs}

if transposed_rhs:
rhs = is_op("relax.permute_dims")(rhs)

out = is_op("relax.matmul")(lhs, rhs)
annotations["root"] = out

scale = is_const()
zp = is_const()
annotations.update({"scale": scale, "zp": zp})

out = is_op("relax.dequantize")(out, scale, zp)

return out, annotations


def make_attention_rewrite_pattern(
qkv_layout: str, out_layout: str, with_bias: bool, with_cast: bool, with_kv_repeat: bool = False
):
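As a quick illustration (not part of the diff), the new pattern can be constructed and its annotation keys inspected directly; the expected key set below is inferred from the annotations dictionary built above.

# Illustrative sketch only.
from tvm.relax.backend.patterns import make_matmul_dequantize_pattern

pattern, annotations = make_matmul_dequantize_pattern(transposed_rhs=True)
# "lhs"/"rhs" are the matmul operands, "root" is the relax.matmul call, and
# "scale"/"zp" are the dequantize constants that _check_matmul validates.
print(sorted(annotations))  # expected: ['lhs', 'root', 'rhs', 'scale', 'zp']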
20 changes: 20 additions & 0 deletions src/relax/backend/contrib/cublas/codegen.cc
@@ -22,6 +22,7 @@
* \brief Implementation of the CUBLAS JSON serializer.
*/
#include <tvm/ir/module.h>
#include <tvm/runtime/builtin_fp16.h>

#include <string>

@@ -74,6 +75,25 @@ class CublasJSONSerializer : public JSONSerializer {
auto node = std::make_shared<JSONGraphNode>(composite_name, /* name_ */
"kernel", /* op_type_ */
inputs, 1 /* num_outputs_ */);
if (composite_name.find("dequantize") != std::string::npos) {
const CallNode* dequantize_call = backend::GetOpInFunction(fn, "relax.dequantize");
if (dequantize_call->args[1]->IsInstance<ConstantNode>()) {
const auto* const_expr = dequantize_call->args[1].as<ConstantNode>();
auto sinfo = Downcast<TensorStructInfo>(const_expr->struct_info_);
float alpha = 1.0;
if (sinfo->dtype == DataType::Float(16)) {
alpha = __gnu_h2f_ieee(static_cast<uint16_t*>(const_expr->data->data)[0]);
} else {
ICHECK(sinfo->dtype == DataType::Float(32));
alpha = static_cast<float*>(const_expr->data->data)[0];
}

std::vector<std::string> dq_scale = {backend::to_str(alpha)};
std::vector<dmlc::any> dq_scale_attr;
dq_scale_attr.emplace_back(dq_scale);
node->SetAttr("dq_scale", dq_scale_attr);
}
}

const CallNode* root_call = backend::GetOpInFunction(fn, "relax.matmul");
SetCallNodeAttribute(node, root_call);
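A small Python analog (an assumption, not code from this change) of what the serializer block above does with the dequantize scale: read the scalar constant, widen fp16 to fp32 if needed, and store it as a decimal string attribute named "dq_scale".

import numpy as np

def dq_scale_attr(scale_const) -> str:
    # _check_matmul guarantees the scale constant is a scalar (ndim == 0).
    value = float(np.asarray(scale_const, dtype="float32").reshape(()))
    # 12 significant digits, mirroring to_str()'s std::setprecision(12).
    return f"{value:.12g}"

dq_scale_attr(np.float16(0.34786))  # -> '0.347900390625'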
12 changes: 12 additions & 0 deletions src/relax/backend/contrib/utils.h
@@ -137,6 +137,18 @@ inline const CallNode* GetOpInFunction(Function f, const std::string& op_name) {
*/
Map<String, IntImm> ExtractArgIdx(String pattern_name, Function f);

/*!
* \brief Converts a numeric value to std::string.
* \param value A numeric value to convert.
* \return String representation of a numeric value.
*/
template <typename Type>
std::string to_str(const Type& value) {
std::ostringstream os;
os << std::setprecision(12) << value;
return os.str();
}

} // namespace backend
} // namespace relax
} // namespace tvm
8 changes: 6 additions & 2 deletions src/runtime/contrib/cublas/cublas.cc
@@ -138,7 +138,8 @@ int roundoff(int v, int d) { return (v + d - 1) / d * d; }
void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
cublasLtMatmulPreference_t matmul_pref_desc, const DLTensor* A, const DLTensor* B,
const DLTensor* bias, const DLTensor* C, bool transa, bool transb,
void* workspace_ptr, size_t workspace_size, cublasLtEpilogue_t epilogue) {
void* workspace_ptr, size_t workspace_size, cublasLtEpilogue_t epilogue,
std::optional<float> dq_scale) {
ICHECK(TypeEqual(A->dtype, B->dtype));
// Reversed strides indicates an in-place transpose operation.
transa = IsInPlaceTransposed(A) ? !transa : transa;
@@ -152,7 +153,10 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
float zero_fp32 = 0.0;
int32_t one_i32 = 1;
int32_t zero_i32 = 0;
void* alpha = &one_fp32;
// Pass the dequantization scale through the "alpha" parameter. If there is no dequantization
// after the matmul, then alpha == 1.0.
float alpha_value = dq_scale.value_or(one_fp32);
void* alpha = &alpha_value;
void* beta = &zero_fp32;

if (TypeMatch(A->dtype, kDLFloat, 16)) {
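The fold above relies on cublasLt computing alpha * (A @ B) + beta * C. Since relax.dequantize conventionally computes (x - zero_point) * scale, a scalar scale with zero_point == 0 collapses exactly into alpha; a NumPy sketch of that equivalence (illustrative shapes and values):

import numpy as np

a = np.random.uniform(0, 5, size=(10, 32)).astype("float32")
b = np.random.uniform(0, 5, size=(32, 64)).astype("float32")
dq_scale = np.float32(0.34786)

# Unfused: matmul followed by dequantize with zero_point == 0.
unfused = ((a @ b) - 0.0) * dq_scale
# Fused: GEMM with alpha = dq_scale and beta = 0, as done in CallCublasLt.
fused = dq_scale * (a @ b)

np.testing.assert_allclose(unfused, fused)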
8 changes: 7 additions & 1 deletion src/runtime/contrib/cublas/cublas_json_runtime.cc
@@ -129,9 +129,15 @@ class CublasJSONRuntime : public JSONRuntimeBase {

auto [a_ptr, b_ptr, bias_ptr] = get_inputs(node, epilogue != CUBLASLT_EPILOGUE_DEFAULT);

std::optional<float> dq_scale = std::nullopt;
if (op_name.find("dequantize") != std::string::npos) {
dq_scale = std::stof(node.GetAttr<std::vector<std::string>>("dq_scale")[0]);
}

tvm::contrib::CallCublasLt(entry_ptr->handle, stream, entry_ptr->matmul_pref_desc, a_ptr,
b_ptr, bias_ptr, out_ptr, transa, transb,
entry_ptr->workspace_ptr, entry_ptr->workspace_size, epilogue);
entry_ptr->workspace_ptr, entry_ptr->workspace_size, epilogue,
dq_scale);
}
}
}
4 changes: 3 additions & 1 deletion src/runtime/contrib/cublas/cublas_utils.h
@@ -34,6 +34,7 @@
#if CUDART_VERSION >= 10010
#include <cublasLt.h>
#endif // CUDART_VERSION >= 10010
#include <optional>

namespace tvm {
namespace contrib {
@@ -124,7 +125,8 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
cublasLtMatmulPreference_t matmul_pref_desc, const DLTensor* A, const DLTensor* B,
const DLTensor* bias, const DLTensor* C, bool transa, bool transb,
void* workspace_ptr, size_t workspace_size,
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT);
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT,
std::optional<float> dq_scale = std::nullopt);

} // namespace contrib
} // namespace tvm
92 changes: 90 additions & 2 deletions tests/python/relax/test_codegen_cublas.py
@@ -24,6 +24,8 @@
from tvm.relax.backend.contrib.cublas import partition_for_cublas
from tvm.relax.testing import get_relax_matmul_module
from tvm.script import relax as R
from tvm.script.ir_builder import IRBuilder
from tvm.script.ir_builder import relax as relax_builder

try:
import ml_dtypes
@@ -60,8 +62,8 @@ def build_and_run(mod, inputs_np, target, legalize=False, cuda_graph=False):
return f(*inputs).numpy()


def get_result_with_relax_cublas_offload(mod, np_inputs, cuda_graph=False):
mod = partition_for_cublas(mod)
def get_result_with_relax_cublas_offload(mod, np_inputs, cuda_graph=False, bind_constants=False):
mod = partition_for_cublas(mod, bind_constants=bind_constants)
mod = relax.transform.RunCodegen()(mod)

return build_and_run(mod, np_inputs, "cuda", cuda_graph)
@@ -95,6 +97,43 @@ def _to_concrete_shape(symbolic_shape, var_table):
}


def get_relax_matmul_dequantize_module(
x_shape,
y_shape,
in_dtype,
out_dtype,
transposed_y=False,
scale_const=1.0,
zero_point_const=0.0,
):
"""Create a matmul op followd by dequantize operations."""
with IRBuilder() as builder:
with relax_builder.function():
R.func_name("main")
x = R.arg("x", R.Tensor(x_shape, in_dtype))
y = R.arg("y", R.Tensor(y_shape, in_dtype))

with R.dataflow() as frame:
if transposed_y:
axes = list(range(len(y_shape) - 2)) + [-1, -2]
y = R.emit(R.permute_dims(y, axes=axes))
result = R.emit(R.matmul(x, y, out_dtype="float32"))
result = R.emit(
R.dequantize(
result,
scale=R.const(scale_const, "float16"),
zero_point=R.const(zero_point_const, "float16"),
axis=-1,
out_dtype=out_dtype,
)
)
R.output(result)
R.func_ret_value(frame.output_vars[0])

func = builder.get()
return tvm.IRModule({"main": func})


@pytest.mark.parametrize(
"x_shape, y_shape, transpose_y, epilogue",
[
@@ -262,6 +301,32 @@ def test_matmul_fp8_offload(
tvm.testing.assert_allclose(out, ref_out, rtol=1e-3, atol=1e-3)


@tvm.testing.requires_cuda_compute_version(9)
@pytest.mark.skipif(ml_dtypes is None, reason="requires ml_dtypes to be installed")
def test_matmul_fp8_dequantize_offload():
x_shape = (10, 32)
y_shape = (64, 32)
in_dtype = "e4m3_float8"
mod = get_relax_matmul_dequantize_module(
x_shape,
y_shape,
in_dtype,
"float16",
transposed_y=True,
scale_const=0.34786,
zero_point_const=0.0,
)

numpytype = "float8_e4m3fn"
x = np.random.uniform(low=0, high=5, size=x_shape).astype(numpytype)
y = np.random.uniform(low=0, high=5, size=y_shape).astype(numpytype)
args = (x, y)

out = get_result_with_relax_cublas_offload(mod, args, bind_constants=True)
ref = build_and_run(mod, args, "llvm", legalize=True)
tvm.testing.assert_allclose(out, ref, rtol=1e-3, atol=1e-3)


@pytest.mark.parametrize(
"M, N, K, out_dtype, transposed_y, partition_done",
[
@@ -283,6 +348,29 @@ def test_cublas_partition_fp8_matmul(M, N, K, out_dtype, transposed_y, partition
assert func_name in mod["main"].script()


@pytest.mark.parametrize(
"M, N, K, scale, zp, num_bindings",
[
(16, 64, 32, 2.0, 0.0, 1),
(16, 64, 32, 2.0, 1.0, 2),
(16, 64, 32, [2.0] * 64, [2.0] * 64, 2),
],
)
def test_cublas_partition_fp8_matmul_dequantize(M, N, K, scale, zp, num_bindings):
mod = get_relax_matmul_dequantize_module(
(M, K),
(N, K),
"e4m3_float8",
"float16",
transposed_y=True,
scale_const=scale,
zero_point_const=zp,
)
mod = partition_for_cublas(mod)
# Check whether R.dequantize remains in the main function (i.e. was not fused into the cuBLAS partition)
assert len(mod["main"].body.blocks[0].bindings) == num_bindings


def test_cublas_partition_matmul_without_bias():
# cuBLAS does not handle 2D bias (residual input)
mod = get_relax_matmul_module((16, 32), (32, 32), "float16", "float16", bias_shape=(16, 32))