#include "macro.h"
// NOTE(review): the header names of the angle-bracket includes below are
// missing from this copy of the file (as are all template argument lists
// further down) — restore them from the original source before building.
#include
#include
#include
#include
#ifndef NDEBUG
#include
#endif

namespace triton_tvm_ffi {

/// @brief Collects a fixed set of attributes of one CUDA device.
///
/// Each attribute is read with cuDeviceGetAttribute; every driver call is
/// wrapped in CUDA_CHECK (defined in macro.h, not visible here).
///
/// @param device_id Ordinal handed to cuDeviceGet.
/// @return Map with the keys "max_shared_mem", "max_num_regs",
///         "multiprocessor_count", "warpSize", "sm_clock_rate",
///         "mem_clock_rate" and "mem_bus_width", holding the raw integer
///         values reported by the driver.
// NOTE(review): the template arguments of the return type are missing in this
// copy of the file.
tvm::ffi::Map GetDeviceProperties(int device_id) {
  tvm::ffi::cuda_api::DeviceHandle device;
  CUDA_CHECK(cuDeviceGet(&device, device_id));

  int maxSharedMem = 0;
  int maxNumRegs = 0;
  int multiprocessorCount = 0;
  int warpSize = 0;
  int smClockRate = 0;
  int memClockRate = 0;
  int memBusWidth = 0;

  CUDA_CHECK(cuDeviceGetAttribute(
      &maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));
  CUDA_CHECK(cuDeviceGetAttribute(
      &maxNumRegs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
  CUDA_CHECK(cuDeviceGetAttribute(
      &multiprocessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
  CUDA_CHECK(
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
  CUDA_CHECK(cuDeviceGetAttribute(&smClockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                  device));
  CUDA_CHECK(cuDeviceGetAttribute(
      &memClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
  CUDA_CHECK(cuDeviceGetAttribute(
      &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));

  // "warpSize" is the only camel-case key; it is kept as-is because the key
  // strings are what callers look up.
  return {{"max_shared_mem", maxSharedMem},
          {"max_num_regs", maxNumRegs},
          {"multiprocessor_count", multiprocessorCount},
          {"warpSize", warpSize},
          {"sm_clock_rate", smClockRate},
          {"mem_clock_rate", memClockRate},
          {"mem_bus_width", memBusWidth}};
}

/// @brief Loads a CUDA module image and looks up one kernel in it.
///
/// If the calling thread has no current context, the primary context of
/// `device` is retained and made current first. When both `shared` and the
/// device's opt-in shared memory limit exceed 49152 bytes (48 KiB), the
/// function's cache preference is set to CU_FUNC_CACHE_PREFER_SHARED and its
/// maximum dynamic shared memory is raised to the opt-in limit minus the
/// kernel's static shared memory.
///
/// @param name   Kernel symbol passed to cuModuleGetFunction.
/// @param data   Module image passed to cuModuleLoadData.
/// @param shared Shared memory requested for the kernel (presumably dynamic
///               shared memory in bytes — confirm against the caller).
/// @param device Device used for the primary context and attribute queries.
/// @return Tuple of (module, function, register count, local memory size in
///         bytes, max threads per block).
// NOTE(review): the template arguments of the return type (and of the Tuple
// constructed in the return statement) are missing in this copy of the file.
tvm::ffi::Tuple LoadBinary(const tvm::ffi::String &name,
                           const tvm::ffi::Bytes &data, int32_t shared,
                           CUdevice device) {
  CUcontext pctx;
  CUfunction fun;
  CUmodule mod;
  int32_t nRegs = 0;
  int32_t nSpills = 0;
  int32_t nMaxThreads = 0;
  int32_t sharedOptin = 0;

  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }

  CUDA_CHECK(cuModuleLoadData(&mod, data.data()));

  // From here on the module is loaded. If any later step leaves this function
  // early (presumably CUDA_CHECK throws on failure — confirm in macro.h), the
  // guard unloads the module so it is not leaked. It is disarmed right before
  // the successful return, which hands the module to the caller.
  struct ModuleGuard {
    CUmodule module;
    bool armed;
    ~ModuleGuard() {
      if (armed) {
        // Best effort: we are already on a failure path, so the result of the
        // unload is intentionally not checked.
        cuModuleUnload(module);
      }
    }
  } moduleGuard{mod, true};

  CUDA_CHECK(cuModuleGetFunction(&fun, mod, name.data()));

  CUDA_CHECK(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
  // NOTE(review): this is the per-thread local memory size in bytes as
  // reported by the driver; confirm callers expect bytes rather than a count
  // of 32-bit words.
  CUDA_CHECK(
      cuFuncGetAttribute(&nSpills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
  CUDA_CHECK(cuFuncGetAttribute(&nMaxThreads,
                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));

  CUDA_CHECK(cuDeviceGetAttribute(
      &sharedOptin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));

  static constexpr int64_t kExpectedMaxDynamicSharedMemory = 49152;
  if (shared > kExpectedMaxDynamicSharedMemory &&
      sharedOptin > kExpectedMaxDynamicSharedMemory) {
    CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));

    // Static shared memory already used by the kernel counts against the
    // opt-in limit, so only the remainder is made available dynamically.
    int32_t sharedStatic = 0;
    CUDA_CHECK(cuFuncGetAttribute(&sharedStatic,
                                  CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
    CUDA_CHECK(
        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                           sharedOptin - sharedStatic));
  }

  moduleGuard.armed = false;
  return tvm::ffi::Tuple{mod, fun, nRegs, nSpills, nMaxThreads};
}

// Registers the utilities as TVM FFI global functions. The first four entries
// are placeholders that throw NotImplementedException when called; the last
// two expose GetDeviceProperties and LoadBinary.
TVM_FFI_STATIC_INIT_BLOCK() {
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef()
      .def_packed("triton_tvm_ffi.utils.build_signature_metadata",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException("build_signature_metadata");
                  })
      .def_packed("triton_tvm_ffi.utils.cuOccupancyMaxActiveClusters",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException(
                        "cuOccupancyMaxActiveClusters");
                  })
      .def_packed("triton_tvm_ffi.utils.fill_tma_descriptor",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException("fill_tma_descriptor");
                  })
      .def_packed("triton_tvm_ffi.utils.set_printf_fifo_size",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException("set_printf_fifo_size");
                  })
      .def("triton_tvm_ffi.utils.get_device_properties", GetDeviceProperties)
      .def("triton_tvm_ffi.utils.load_binary", LoadBinary);
}

} // namespace triton_tvm_ffi