#include "macro.h"
#include <cuda.h>
#include <tvm/ffi/extra/cuda/cubin_launcher.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/ffi/tvm_ffi.h>
#ifndef NDEBUG
#include <cassert>
#endif

namespace triton_tvm_ffi {

// Collects a fixed set of integer hardware attributes for one CUDA device.
//
// device_id is the ordinal handed to cuDeviceGet. Every driver call goes
// through CUDA_CHECK, so a failing query is reported by that macro instead of
// yielding a partially filled map.
//
// The returned map has the keys "max_shared_mem", "max_num_regs",
// "multiprocessor_count", "warpSize", "sm_clock_rate", "mem_clock_rate" and
// "mem_bus_width", each holding the raw value reported by
// cuDeviceGetAttribute.
tvm::ffi::Map<tvm::ffi::String, int32_t> GetDeviceProperties(int device_id) {
  tvm::ffi::cuda_api::DeviceHandle dev;
  CUDA_CHECK(cuDeviceGet(&dev, device_id));

  // Reads a single integer attribute of `dev` through the driver API.
  const auto readAttr = [&dev](CUdevice_attribute which) -> int {
    int value = 0;
    CUDA_CHECK(cuDeviceGetAttribute(&value, which, dev));
    return value;
  };

  // Queries are issued one per statement so their order is fixed.
  const int sharedMemOptin =
      readAttr(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
  const int regsPerBlock =
      readAttr(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
  const int smCount = readAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
  const int lanesPerWarp = readAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE);
  const int coreClock = readAttr(CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
  const int memoryClock = readAttr(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
  const int busWidth = readAttr(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);

  return {{"max_shared_mem", sharedMemOptin},
          {"max_num_regs", regsPerBlock},
          {"multiprocessor_count", smCount},
          {"warpSize", lanesPerWarp},
          {"sm_clock_rate", coreClock},
          {"mem_clock_rate", memoryClock},
          {"mem_bus_width", busWidth}};
}

// Loads a compiled kernel image into the current CUDA context and looks up one
// kernel in it.
//
// Parameters:
//   name   - name of the kernel to resolve with cuModuleGetFunction.
//   data   - module image handed to cuModuleLoadData.
//   shared - amount of shared memory (bytes) the caller intends to request for
//            the kernel; values above 48 KiB trigger the opt-in path below.
//   device - device used to retain a primary context when the calling thread
//            has none, and to query the shared-memory opt-in limit.
//
// Returns a tuple of:
//   (module handle, function handle, register count,
//    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
// with both handles encoded as 64-bit integers. On success the caller owns the
// module handle; this function does not unload it.
//
// Every driver call is wrapped in CUDA_CHECK. If a call after the module load
// fails, the module is unloaded again before the error leaves this function
// (this assumes CUDA_CHECK reports failures by throwing; the macro is defined
// outside this file).
tvm::ffi::Tuple<uint64_t, uint64_t, int32_t, int32_t, int32_t>
LoadBinary(const tvm::ffi::String &name, const tvm::ffi::Bytes &data,
           int32_t shared, CUdevice device) {
  // Owns the freshly loaded module until it is handed over to the caller, so
  // that an error on any later step does not leak it.
  struct ModuleGuard {
    CUmodule handle = nullptr;
    ModuleGuard() = default;
    ModuleGuard(const ModuleGuard &) = delete;
    ModuleGuard &operator=(const ModuleGuard &) = delete;
    ~ModuleGuard() {
      if (handle != nullptr) {
        // Best effort: a destructor has no way to report a failure here.
        cuModuleUnload(handle);
      }
    }
  };

  // Make sure a context is current; fall back to the device's primary context.
  CUcontext pctx = nullptr;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }

  ModuleGuard module;
  CUDA_CHECK(cuModuleLoadData(&module.handle, data.data()));
  CUfunction fun = nullptr;
  CUDA_CHECK(cuModuleGetFunction(&fun, module.handle, name.data()));

  int32_t nRegs = 0;
  int32_t nSpills = 0;
  int32_t nMaxThreads = 0;
  int32_t sharedOptin = 0;
  CUDA_CHECK(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
  CUDA_CHECK(
      cuFuncGetAttribute(&nSpills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
  CUDA_CHECK(cuFuncGetAttribute(&nMaxThreads,
                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
  CUDA_CHECK(cuDeviceGetAttribute(
      &sharedOptin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));

  // Requests above 48 KiB of dynamic shared memory require an explicit opt-in
  // on the function, and only make sense when the device offers more than
  // that.
  static constexpr int64_t kExpectedMaxDynamicSharedMemory = 49152;
  if (shared > kExpectedMaxDynamicSharedMemory &&
      sharedOptin > kExpectedMaxDynamicSharedMemory) {
    CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
    int32_t sharedStatic = 0;
    CUDA_CHECK(cuFuncGetAttribute(&sharedStatic,
                                  CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
    // Static shared memory counts against the opt-in limit, so only the
    // remainder can be made available as dynamic shared memory.
    CUDA_CHECK(
        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                           sharedOptin - sharedStatic));
  }

  // Build the result while the guard still owns the module, then release
  // ownership to the caller.
  tvm::ffi::Tuple<uint64_t, uint64_t, int32_t, int32_t, int32_t> result{
      reinterpret_cast<uint64_t>(module.handle),
      reinterpret_cast<uint64_t>(fun), nRegs, nSpills, nMaxThreads};
  module.handle = nullptr;
  return result;
}

TVM_FFI_STATIC_INIT_BLOCK() {
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef()
      .def_packed("triton_tvm_ffi.utils.build_signature_metadata",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException("build_signature_metadata");
                  })
      .def_packed("triton_tvm_ffi.utils.cuOccupancyMaxActiveClusters",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException(
                        "cuOccupancyMaxActiveClusters");
                  })
      .def_packed("triton_tvm_ffi.utils.fill_tma_descriptor",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException("fill_tma_descriptor");
                  })
      .def_packed("triton_tvm_ffi.utils.set_printf_fifo_size",
                  [](tvm::ffi::PackedArgs args, tvm::ffi::Any *ret) {
                    throw NotImplementedException("set_printf_fifo_size");
                  })
      .def("triton_tvm_ffi.utils.get_device_properties", GetDeviceProperties)
      .def("triton_tvm_ffi.utils.load_binary", LoadBinary);
}

} // namespace triton_tvm_ffi