Merged
Changes from all commits
83 commits
ff40ece
Refactor CMakeLists.txt to set default build type and manage dependen…
LeiWang1999 Jun 13, 2025
9c2d9a3
Update default value of `check_well_formed` parameter in `prim_func` …
LeiWang1999 Jun 13, 2025
266ebf7
Add StorageRewrite function to transform module
LeiWang1999 Jun 13, 2025
1dd1279
Refactor null option handling in IR and layout inference
LeiWang1999 Jun 13, 2025
54da710
Update TVM subproject and refactor cluster planning and tile operatio…
LeiWang1999 Jun 13, 2025
b7a2ceb
Update annotation type in warp specialized test for consistency
LeiWang1999 Jun 13, 2025
2f93d63
Refactor test execution in warp specialized test
LeiWang1999 Jun 13, 2025
738a2d4
refactor
Hzfengsy Jun 24, 2025
6b571d1
[Enhancement] Add strict layout map for improved buffer layout infere…
LeiWang1999 Jun 24, 2025
4145e1c
[Example] Update examples to use @tilelang.jit (#597)
Cunxiao2002 Jun 25, 2025
6798ab0
[Enhancement] Refine error messaging in LowerBulkCopy for global and …
LeiWang1999 Jun 26, 2025
2968567
[Enhancement] Introduce PassConfig `TL_ENABLE_AGGRESSIVE_SHARED_MEMOR…
LeiWang1999 Jun 26, 2025
2a7755b
[Refactor] Update accumulation handling in gemm_sm90.h (#603)
LeiWang1999 Jun 27, 2025
09795e1
[Enhancement] Add tma bulk copy. (#600)
cherichy Jun 27, 2025
4c67b92
[Bugfix] Fixed mha_bwd shape inconsistency error (#604)
Nathancgy Jun 30, 2025
126fb09
lint fix
LeiWang1999 Jun 30, 2025
03d83d6
Update requirements-lint.txt to maintain clang-format version consist…
LeiWang1999 Jun 30, 2025
38372c9
[Bugfix] Avoid duplicate data access when cross thread buffer meet re…
LeiWang1999 Jun 30, 2025
cde4cb8
[Enhancement] Support tf32 gemm_rs (#607)
LeiWang1999 Jul 1, 2025
e3cc3d5
[Enhancement] Introduce option `TL_DISABLE_FAST_MATH` and `TL_ENABLE_…
LeiWang1999 Jul 2, 2025
67e1d63
fix build
Hzfengsy Jul 3, 2025
d7d4ac1
[Experimental][Language] add `T.GEMM_SP` for sm90 sparse tensor core …
botbw Jul 3, 2025
b5b768d
[Doc] Phaseout Legacy documentations (#610)
LeiWang1999 Jul 4, 2025
6aabdc4
[Refactor] Phaseout Pass ParallelLoopTransformer (#611)
LeiWang1999 Jul 4, 2025
2dce16c
fix build
Hzfengsy Jul 5, 2025
726fa8d
[Enhancement] Update ReduceOp initialization values for integer types…
LeiWang1999 Jul 8, 2025
6f4ef6f
Bump transformers from 4.50.0 to 4.51.0 in /examples/bitnet-1.58b (#615)
dependabot[bot] Jul 8, 2025
9ace736
[Refactor] refactor autotune examples (#617)
LeiWang1999 Jul 8, 2025
f74e3e8
lint fix
LeiWang1999 Jul 9, 2025
1aeb7c1
Bump transformers from 4.51.0 to 4.52.1 in /examples/bitnet-1.58b (#619)
dependabot[bot] Jul 9, 2025
39f003a
Fix PTXAS options flag in LibraryGenerator for consistency (#620)
LeiWang1999 Jul 9, 2025
bd8308e
Refactor FP8 type handling across multiple files to standardize usage…
LeiWang1999 Jul 9, 2025
471846a
[Refactor] Add parallel loop transform pass for condition extraction …
xxw-keju Jul 9, 2025
648e4b1
[Dev] Update linear attention examples to enhance performance on Hopp…
Rachmanino Jul 9, 2025
5ee9710
[Enhancement] Add ahead of time cython compilation in setup.py (#622)
LeiWang1999 Jul 10, 2025
608993f
[Enhancement] Support more flexible layout host pythonic expr (#623)
LeiWang1999 Jul 10, 2025
cd8f6bd
[Enhancement] support composable expression for shape with symbolic v…
LeiWang1999 Jul 10, 2025
4e7d760
🐍Fix the file name "test_exmaple_tilelang_nsa" (#629)
kadirnar Jul 12, 2025
bf5c1f6
[Enhancement] Add CPU utilization and count settings for Auto-Tuning …
LeiWang1999 Jul 12, 2025
03bea88
[AutoTune] Support `with set_autotune_inputs` to set auto tuning inpu…
LeiWang1999 Jul 13, 2025
fb5eeb8
[Pass] Introduce flag to disable cp async lowering (#633)
LeiWang1999 Jul 14, 2025
e02486e
fix typo (#635)
xiayuqing0622 Jul 15, 2025
0ffba21
[Pass][Simplify] Introduce symbolic level simplify for condition expr…
LeiWang1999 Jul 15, 2025
da91ee3
Enhance test coverage by adding LLVM requirement decorator to multipl…
LeiWang1999 Jul 15, 2025
86cda24
lint fix
LeiWang1999 Jul 15, 2025
42f4a16
Fix software pipeline stage annotation and update optional config han…
LeiWang1999 Jul 15, 2025
1f6f029
Add Python executable detection in CMake configuration and update TVM…
LeiWang1999 Jul 16, 2025
904d098
Update TVM submodule reference and refactor FFI registration to use s…
LeiWang1999 Jul 16, 2025
97992c6
Refactor attribute handling in layout and IR nodes to use reflection …
LeiWang1999 Jul 17, 2025
61fab43
finish rebase
LeiWang1999 Jul 24, 2025
7142d07
tvm update
LeiWang1999 Jul 24, 2025
034c771
Refactor FFI registration across tilelang modules to use the updated …
LeiWang1999 Jul 24, 2025
7f37f4d
lint fix
LeiWang1999 Jul 24, 2025
633c543
Update TVM submodule reference and modify CUDA runtime argument handl…
LeiWang1999 Jul 24, 2025
6f10955
lint fix
LeiWang1999 Jul 24, 2025
b5c77ac
Refactor tensor data type references from "e4m3_float8" and "e5m2_flo…
LeiWang1999 Jul 24, 2025
1116daf
lint fix
LeiWang1999 Jul 24, 2025
ad680f6
Refactor forward_index initialization in Fragment class to default to…
LeiWang1999 Jul 24, 2025
cf550d4
test fix
LeiWang1999 Jul 25, 2025
85b57b4
lint fix
LeiWang1999 Jul 25, 2025
53335df
bugfix
LeiWang1999 Jul 27, 2025
43efb69
lint fix
LeiWang1999 Jul 27, 2025
faac5a2
reduce fix
LeiWang1999 Jul 28, 2025
16801bd
lint fix
LeiWang1999 Jul 28, 2025
12960a9
carver fix
LeiWang1999 Jul 28, 2025
99f7cbb
cast fix
LeiWang1999 Jul 28, 2025
e7d2661
Update submodule and enhance kernel launch functionality with optiona…
LeiWang1999 Jul 28, 2025
8d796cb
lint fix
LeiWang1999 Jul 28, 2025
273e1ce
bugfix
LeiWang1999 Jul 28, 2025
8247d2a
Refactor test execution in test_tilelang_cpu_gemm.py and enhance devi…
LeiWang1999 Jul 28, 2025
1410d93
lint fix
LeiWang1999 Jul 28, 2025
c146c0f
Update runtime.cc
LeiWang1999 Jul 29, 2025
f711e9f
phase out license
LeiWang1999 Jul 29, 2025
028e28c
Merge branch 'main' of https://github.com/tile-ai/tilelang into refactor
LeiWang1999 Jul 29, 2025
eec3ef8
Update subproject commit for TVM to 555cc71
LeiWang1999 Jul 29, 2025
efe8942
Merge branch 'refactor' of https://github.com/Hzfengsy/tilelang into …
LeiWang1999 Jul 29, 2025
4657eff
Update subproject commit for TVM to d39953fa
LeiWang1999 Jul 29, 2025
af3ed48
Update subproject commit for TVM to 9574805f
LeiWang1999 Jul 29, 2025
003a9f7
Update subproject commit for TVM to a08b7c3
LeiWang1999 Jul 29, 2025
15d7e4e
fix ci
xwhzz Jul 29, 2025
e0de9a3
Merge branch 'main' of https://github.com/tile-ai/tilelang into refactor
LeiWang1999 Jul 30, 2025
fedc73b
Merge branch 'main' of https://github.com/tile-ai/tilelang into refactor
LeiWang1999 Jul 30, 2025
0234f3e
ci fix
LeiWang1999 Jul 30, 2025
32 changes: 24 additions & 8 deletions .github/workflows/ci.yml
@@ -7,7 +7,7 @@ env:

jobs:
format-check:
runs-on: ubuntu-latest
runs-on: self-hosted

permissions:
contents: write
@@ -26,21 +26,37 @@ jobs:
with:
python-version: ${{ env.PYTHON_VERSION }}

- name: Install dependencies
- name: Ensure venv (local & persistent)
run: |
python -m pip install --upgrade pip
pip install yapf==0.40.2 toml==0.10.2 tomli==2.0.1 ruff==0.6.5 codespell==2.3.0 clang-format==15.0.7
set -e
REQS_HASH=$(cat requirements-test.txt 2>/dev/null || true)
MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}"

if [[ -f "$MARKER" ]] && [[ -f "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" ]]; then
echo "venv exists and hash matches – reuse it"
else
echo "venv stale or missing – recreating"
rm -rf "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" "$MARKER"
python -m venv "${{ runner.tool_cache }}/${{ env.VENV_DIR }}"
# shellcheck source=/dev/null
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
python -m pip install --upgrade pip --no-user
[[ -f requirements-test.txt ]] && \
PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-test.txt --no-user
pip install . --no-user
touch "$MARKER"
fi

- name: Run format check
run: |
git clone https://github.com/tile-ai/tilelang.git main_repo
cp main_repo/format.sh .
rm -rf main_repo
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
if ! output=$(./format.sh 2>&1); then
echo "------------------------------------"
echo "message:"
echo "$output"
echo "------------------------------------"
printf '%s\n' "$output" | grep "Please review and stage the changes."
echo "------------------------------------"
exit 1
fi

- name: Commit and Push Changes
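The step above reuses a persistent virtualenv keyed on the contents of requirements-test.txt (a marker file named after the first eight characters of the file). As a rough illustration of the same idea outside a workflow file, here is a hedged Python sketch; the cache directory and file names are hypothetical, and the real CI uses the shell logic in the diff:

```python
import hashlib
import os
import subprocess
import sys

# Hypothetical stand-ins for runner.tool_cache and env.VENV_DIR in the workflow.
TOOL_CACHE = os.path.expanduser("~/.cache/ci-venvs")
VENV_DIR = os.path.join(TOOL_CACHE, "tilelang-venv")


def requirements_key(path: str = "requirements-test.txt") -> str:
    """Short digest of the requirements file; any edit to it invalidates the venv."""
    try:
        with open(path, "rb") as f:
            return hashlib.sha256(f.read()).hexdigest()[:8]
    except FileNotFoundError:
        return "noreqs"


def ensure_venv() -> None:
    marker = os.path.join(TOOL_CACHE, f".venv_marker_{requirements_key()}")
    activate = os.path.join(VENV_DIR, "bin", "activate")
    if os.path.isfile(marker) and os.path.isfile(activate):
        print("venv exists and key matches - reusing it")
        return
    print("venv stale or missing - recreating")
    os.makedirs(TOOL_CACHE, exist_ok=True)
    subprocess.run([sys.executable, "-m", "venv", "--clear", VENV_DIR], check=True)
    pip = os.path.join(VENV_DIR, "bin", "pip")
    subprocess.run([pip, "install", "--upgrade", "pip"], check=True)
    if os.path.isfile("requirements-test.txt"):
        subprocess.run([pip, "install", "-r", "requirements-test.txt"], check=True)
    open(marker, "w").close()


if __name__ == "__main__":
    ensure_venv()
```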
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 979c8e to a08b7c
17 changes: 16 additions & 1 deletion CMakeLists.txt
@@ -11,6 +11,14 @@ endif()

# Enable compile command export
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT Python_EXECUTABLE)
execute_process(
COMMAND which python
OUTPUT_VARIABLE Python_EXECUTABLE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(Python_EXECUTABLE "${Python_EXECUTABLE}" CACHE FILEPATH "Path to the Python executable")
endif()

# Define a custom macro for globbing files with conditional CONFIGURE_DEPENDS
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")
@@ -39,7 +47,8 @@ else()

# Set default build type to RelWithDebInfo if not provided
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Build type" FORCE)
# Set default build type to Release if not provided
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
message(STATUS "Setting default build type to ${CMAKE_BUILD_TYPE}")
endif()
endif()
@@ -145,6 +154,7 @@ message(STATUS "TVM_SOURCE_DIR: ${TVM_SOURCE_DIR}")
# Include directories for TileLang
set(TILE_LANG_INCLUDES
${TVM_SOURCE_DIR}/include
${TVM_SOURCE_DIR}/ffi/include
${TVM_SOURCE_DIR}/src
${TVM_SOURCE_DIR}/3rdparty/dlpack/include
${TVM_SOURCE_DIR}/3rdparty/dmlc-core/include
@@ -212,6 +222,11 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_definitions(tilelang_static PRIVATE "TVM_LOG_DEBUG")
endif()

# Building tvm_cython modules
if(NOT DEFINED TVM_PREBUILD_PATH)
add_dependencies(tilelang tvm_cython)
endif()

# Module shared library
add_library(tilelang_module SHARED $<TARGET_OBJECTS:tilelang_objs>)
target_link_libraries(tilelang_module PUBLIC tvm)
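CMake's FindPython module reads the `Python_EXECUTABLE` cache variable (capital P, lowercase rest), which is presumably why the setup.py change later in this diff switches from `-DPYTHON_EXECUTABLE` to `-DPython_EXECUTABLE`. A hedged sketch of configuring the build with the interpreter pinned explicitly, so the `which python` fallback above never has to guess (paths and options are illustrative, not part of this PR):

```python
# Illustrative only: configure a CMake build tree with the current interpreter
# pinned, mirroring what setup.py now passes via -DPython_EXECUTABLE.
import subprocess
import sys

subprocess.run(
    [
        "cmake", "-S", ".", "-B", "build",
        f"-DPython_EXECUTABLE={sys.executable}",  # avoids the `which python` fallback in CMakeLists.txt
        "-DCMAKE_BUILD_TYPE=Release",             # matches the new default build type
    ],
    check=True,
)
subprocess.run(["cmake", "--build", "build", "-j"], check=True)
```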
8 changes: 3 additions & 5 deletions benchmark/matmul_fp8/benchmark_matmul.py
@@ -54,10 +54,8 @@ def get_configs(args, kwargs):
from tilelang.carver.roller.rasterization import NoRasterization
import torch

if torch.version.hip is not None:
arch=CDNA("hip")
else:
arch = CUDA("cuda")
arch = CDNA("hip") if torch.version.hip is not None else CUDA("cuda")

topk = 10

carve_template = MatmulTemplate(
@@ -158,7 +156,7 @@ def matmul(

# Use half-precision for input data to reduce memory bandwidth,
# accumulate in float for better numerical accuracy
dtype = "e4m3_float8"
dtype = "float8_e4m3"
accum_dtype = "float"

@T.prim_func
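This PR renames the FP8 dtype strings from "e4m3_float8"/"e5m2_float8" to the torch-style "float8_e4m3"/"float8_e5m2" across benchmarks, examples, and tests. A hedged sketch of the correspondence; the mapping to torch dtypes is an assumption added here for illustration (torch exposes the e4m3 format as float8_e4m3fn) and is not part of this diff:

```python
import torch

# Old tilelang spelling -> new spelling adopted in this PR.
RENAMED_FP8_DTYPES = {
    "e4m3_float8": "float8_e4m3",
    "e5m2_float8": "float8_e5m2",
}

# Assumed mapping from the new dtype strings to torch dtypes, for reference when
# preparing inputs (verify against your torch build; requires torch >= 2.1).
TORCH_FP8_DTYPES = {
    "float8_e4m3": torch.float8_e4m3fn,
    "float8_e5m2": torch.float8_e5m2,
}


def to_torch_dtype(tl_dtype: str) -> torch.dtype:
    """Translate a tilelang FP8 dtype string (old or new spelling) into a torch dtype."""
    return TORCH_FP8_DTYPES[RENAMED_FP8_DTYPES.get(tl_dtype, tl_dtype)]


if __name__ == "__main__":
    x = torch.randn(128, 128).to(to_torch_dtype("float8_e4m3"))
    print(x.dtype)  # torch.float8_e4m3fn
```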
2 changes: 1 addition & 1 deletion examples/bitnet-1.58b/requirements.txt
@@ -1,3 +1,3 @@
lm_eval==0.3.0
flash_attn
transformers==4.52.1
transformers==4.52.1
4 changes: 2 additions & 2 deletions examples/cast/example_group_per_split_token_cast_to_fp8.py
@@ -17,7 +17,7 @@ def group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m):

@T.prim_func
def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor(
(BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "e4m3_float8"), X_amax: T.Tensor(
(BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "float8_e4m3"), X_amax: T.Tensor(
(BG, M_max, T.ceildiv(N, group_size)), accum_dtype)):
with T.Kernel(
T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz):
@@ -28,7 +28,7 @@ def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor
y_amax_local = T.alloc_fragment((blk_m,), accum_dtype)
y_s_local = T.alloc_fragment((blk_m,), accum_dtype)
y_q_local = T.alloc_fragment((blk_m, group_size), accum_dtype)
y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "e4m3_float8")
y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3")
row_offset = T.alloc_local((1,), "int32")

T.annotate_layout({
4 changes: 2 additions & 2 deletions examples/cast/example_per_token_cast_to_fp8.py
@@ -15,7 +15,7 @@ def per_token_cast_to_fp8(M, N, blk_m):
fp8_max = 448.0

@T.prim_func
def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "e4m3_float8"),
def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e4m3"),
X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype)):
with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (bx, by):
row = bx
@@ -24,7 +24,7 @@ def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "e4m3_flo
y_amax_local = T.alloc_fragment((blk_m,), dtype)
y_s_local = T.alloc_fragment((blk_m,), dtype)
y_q_local = T.alloc_fragment((blk_m, group_size), dtype)
y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "e4m3_float8")
y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3")

T.annotate_layout({
y_local:
8 changes: 4 additions & 4 deletions examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py
@@ -20,8 +20,8 @@ def tl_gemm(
accum_dtype,
):
assert in_dtype in [
"e4m3_float8",
], "Currently only e4m3_float8 is supported"
"float8_e4m3",
], "Currently only float8_e4m3 is supported"
assert out_dtype in [
"bfloat16",
"float32",
@@ -179,11 +179,11 @@ def assert_tl_gemm_correctness(M, N, K, block_N, in_dtype, out_dtype, accum_dtyp


def main():
assert_tl_gemm_correctness(1024, 1024, 8192, 128, "e4m3_float8", "bfloat16", "float32")
assert_tl_gemm_correctness(1024, 1024, 8192, 128, "float8_e4m3", "bfloat16", "float32")


if __name__ == "__main__":
for dtype in ["e4m3_float8"]:
for dtype in ["float8_e4m3"]:
for out_dtype in ["bfloat16", "float32"]:
for block_N in [16, 32, 64, 128]:
assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, "float32")
@@ -11,7 +11,7 @@
def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H):
scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e)
dtype = "float16"
q_dtype = "e4m3_float8"
q_dtype = "float8_e4m3"
accum_dtype = "float"
kv_group_num = heads // kv_head_num
VALID_BLOCK_H = min(block_H, kv_group_num)
4 changes: 2 additions & 2 deletions examples/gemm_fp8/example_tilelang_gemm_fp8.py
@@ -57,8 +57,8 @@ def test_gemm_fp8(M, N, K, dtype):


def main():
test_gemm_fp8(1024, 1024, 1024, 'e4m3_float8')
test_gemm_fp8(1024, 1024, 1024, 'e5m2_float8')
test_gemm_fp8(1024, 1024, 1024, 'float8_e4m3')
test_gemm_fp8(1024, 1024, 1024, 'float8_e5m2')


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py
@@ -74,8 +74,8 @@ def test_gemm_fp8(M, N, K, dtype):


def main():
test_gemm_fp8(1024, 1024, 8192, 'e4m3_float8')
test_gemm_fp8(1024, 1024, 8192, 'e5m2_float8')
test_gemm_fp8(1024, 1024, 8192, 'float8_e4m3')
test_gemm_fp8(1024, 1024, 8192, 'float8_e5m2')


if __name__ == "__main__":
10 changes: 5 additions & 5 deletions examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
@@ -40,8 +40,8 @@ def tl_matmul(
):
assert in_dtype in [
"float16",
"e4m3_float8",
"e5m2_float8",
"float8_e4m3",
"float8_e5m2",
"int8",
], "Currently only float16 and int8 are supported"
assert out_dtype in [
@@ -52,7 +52,7 @@

micro_size_x = micro_size_y = micro_size_k = 16

is_float8 = in_dtype in ["e4m3_float8", "e5m2_float8"]
is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"]
if out_dtype == "int32" or is_float8:
micro_size_k = 32

@@ -216,8 +216,8 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype):


def main():
assert_tl_matmul_correctness(128, 128, 128, "e4m3_float8", "float32", "float32")
assert_tl_matmul_correctness(128, 128, 128, "e5m2_float8", "float32", "float32")
assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32")
assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32")


if __name__ == "__main__":
5 changes: 2 additions & 3 deletions examples/warp_specialize/example_warp_specialize_flashmla.py
@@ -9,6 +9,7 @@
tilelang.disable_cache()


@tilelang.jit(out_idx=[6])
def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split):
scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e)
dtype = "float16"
@@ -79,7 +80,6 @@ def flash_attn(
p0_1_1_ready_barrier = T.alloc_barrier(arrive_count=128)
lse_0_ready_barrier = T.alloc_barrier(arrive_count=128)
lse_1_ready_barrier = T.alloc_barrier(arrive_count=128)
s_shared_ready_barrier = T.alloc_barrier(arrive_count=128)
q_shared_ready_barrier = T.alloc_barrier(arrive_count=256)
k_pe_shared_1_free_barrier = T.alloc_barrier(arrive_count=128)
k_pe_shared_0_free_barrier = T.alloc_barrier(arrive_count=128)
@@ -401,8 +401,7 @@ def main(batch=1, heads=128, kv_heads=1, kv_ctx=8192, dim=512, pe_dim=64):
BLOCK_H = 64
num_split = 1

program = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split)
kernel = tilelang.compile(program, out_idx=[6])
kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split)
profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn)
profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)
latency = profiler.do_bench(warmup=500)
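The example above replaces the explicit `program = flashattn(...)` / `tilelang.compile(program, out_idx=[6])` pair with the `@tilelang.jit(out_idx=[6])` decorator, so calling the decorated function returns a compiled kernel directly. A minimal sketch of that pattern, with a toy copy kernel standing in for the attention kernel (shapes, names, and the kernel body are illustrative only):

```python
import tilelang
import tilelang.language as T


@tilelang.jit(out_idx=[1])  # treat the second argument (B) as the kernel output
def copy_kernel(M=1024, N=1024, blk=128, dtype="float16"):

    @T.prim_func
    def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
        with T.Kernel(T.ceildiv(M, blk), T.ceildiv(N, blk), threads=128) as (bx, by):
            for i, j in T.Parallel(blk, blk):
                B[bx * blk + i, by * blk + j] = A[bx * blk + i, by * blk + j]

    return main


# Calling the decorated function compiles and returns the kernel, replacing the
# older two-step flow of building the program and then calling tilelang.compile.
kernel = copy_kernel()
profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn)
```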
1 change: 1 addition & 0 deletions requirements-build.txt
@@ -1,4 +1,5 @@
# Should be mirrored in pyproject.toml
Cython
build
cmake>=3.26
packaging
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -1,6 +1,7 @@
# lint requirements
-r requirements-lint.txt
# build requirements
Cython
cmake>=3.26
# runtime requirements
cffi
2 changes: 1 addition & 1 deletion setup.py
@@ -815,7 +815,7 @@ def build_cmake(self, ext):
# -DCMAKE_LIBRARY_OUTPUT_DIRECTORY sets where built libraries go
# -DPYTHON_EXECUTABLE ensures that the correct Python is used
cmake_args = [
f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", f"-DPython_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={'Debug' if DEBUG_MODE else 'Release'}"
]
if not USE_ROCM: