package device

import (
	"bytes"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"strconv"
	"strings"
	"syscall"

	"a.yandex-team.ru/infra/rsm/nvgpumanager/vendor/github.com/NVIDIA/go-nvml/pkg/nvml"

	opentracing "github.com/opentracing/opentracing-go"
	"go.uber.org/zap"
	"golang.org/x/sys/unix"

	"a.yandex-team.ru/infra/rsm/nvgpumanager/internal/config"
	"a.yandex-team.ru/infra/rsm/nvgpumanager/internal/ilog"
	"a.yandex-team.ru/infra/rsm/nvgpumanager/internal/utils"
	"a.yandex-team.ru/infra/rsm/nvgpumanager/pkg/modprobe"
)

const (
	// KB and MB are binary size units in bytes; MB is used in this file to
	// convert byte counts reported by NVML into megabytes.
	KB = 1024
	MB = (1024 * KB)

	// NvidiaBlackListConfig is the modprobe.d file written while the driver
	// is disabled and removed again when the driver is enabled.
	NvidiaBlackListConfig = "/etc/modprobe.d/blacklist-nvgpu-manager.conf"
	// FIXME: remove version hardcode https://st.yandex-team.ru/RESMAN-9

	// Device nodes reported by GetCtlDevices; the nvlink node is only
	// reported when it exists.
	NvidiaCtlDev    = "/dev/nvidiactl"
	NvidiaNvlinkDev = "/dev/nvidia-nvlink"

	// nvidia-uvm device name (used to look up the device major number),
	// the two uvm device nodes, and the mode applied to nodes created by
	// checkUvmDevs.
	NvidiaUvmDevName   = "nvidia-uvm"
	NvidiaUvmDev       = "/dev/nvidia-uvm"
	NvidiaUvmToolsDev  = "/dev/nvidia-uvm-tools"
	NvidiaUvmDevsPerms = 0666
)

var (
	// NvidiaModuleList lists the nvidia kernel modules handled by
	// doDisableDriver. Every name in Names is blacklisted, while Alias is the
	// name used to check for and unload the module. Modules are unloaded in
	// slice order, so the core "nvidia" module comes last.
	NvidiaModuleList = []modprobe.Module{
		{
			Names: []string{"nvidia_drm", "nvidia_418_drm", "nvidia_450_drm"},
			Alias: "nvidia_drm",
		},
		{
			Names: []string{"nvidia_modeset", "nvidia_418_modeset", "nvidia_450_modeset"},
			Alias: "nvidia_modeset",
		},
		{
			Names: []string{"nvidia_uvm", "nvidia_418_uvm", "nvidia_450_uvm"},
			Alias: "nvidia_uvm",
		},
		{
			Names: []string{"nvidia", "nvidia_418", "nvidia_450"},
			Alias: "nvidia",
		},
	}

	// uvmDevsInitialized is set by Init once initUvmDevs has succeeded and
	// is consulted by GetCtlDevices when nvidia-uvm is optional.
	uvmDevsInitialized bool
)

// NvmlAPIDevice is the accessor view of a device that DeviceStatus consumes.
// The IsMigDevice/IsMigEnabled methods suggest it is implemented by both
// full GPUs and MIG devices — implementations live outside this chunk.
type NvmlAPIDevice interface {
	// GetDevice returns the underlying NVML device handle.
	GetDevice() nvml.Device
	GetUUID() string
	GetModel() string
	GetPathes() []string
	GetUniquePath() string
	GetPower() uint32
	GetMemory() uint64
	GetCPUAffinity() uint
	GetPCIBusID() string
	IsMigDevice() bool
	IsMigEnabled() bool
	GetMinor() uint32
}

// NvmlAPIDeviceStatus is the runtime snapshot filled in by DeviceStatus.
//
// As populated by DeviceStatus: Power is the NVML power usage divided by
// 1000 (watts), Temperature is the nvml.TEMPERATURE_GPU sensor reading,
// memory figures are divided by MB, and Throttle carries the current clocks
// throttle reasons value. Values NVML reports as unsupported are left at
// their zero value.
type NvmlAPIDeviceStatus struct {
	Power uint32
	// FanSpeed    *uint
	Temperature uint32
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Processes   []nvml.ProcessInfo
	Throttle    ThrottleReason
	// Performance PerfState
}

// UtilizationInfo groups the utilization figures read by DeviceStatus:
// GPU and Memory come from GetUtilizationRates, Encoder and Decoder from
// GetEncoderUtilization and GetDecoderUtilization respectively.
type UtilizationInfo struct {
	GPU     uint32
	Memory  uint32
	Encoder uint32
	Decoder uint32
}

// MemoryInfo holds device memory usage and ECC error counters.
// DeviceStatus divides the Global Total/Free/Used fields by MB and reads
// ECCErrors for uncorrected, volatile errors.
type MemoryInfo struct {
	Global    nvml.Memory
	ECCErrors nvml.EccErrorCounts
}

// ClockInfo holds the clock readings taken by DeviceStatus:
// Cores from nvml.CLOCK_SM and Memory from nvml.CLOCK_MEM.
type ClockInfo struct {
	Cores  uint32
	Memory uint32
}

// PCIThroughputInfo holds PCIe receive/transmit throughput. DeviceStatus
// stores the NVML readings divided by 1000 (KB/s converted to MB/s).
type PCIThroughputInfo struct {
	RX uint32
	TX uint32
}

// PCIStatusInfo holds PCI related status: BAR1Used is the used BAR1 memory
// divided by MB, Throughput the current PCIe throughput.
type PCIStatusInfo struct {
	BAR1Used   uint64
	Throughput PCIThroughputInfo
}

// ThrottleReason is the clocks throttle reasons value reported by NVML.
type ThrottleReason uint64

// String returns a human readable name for r. Only values equal to exactly
// one of the known reason constants are named; anything else yields "N/A".
func (r ThrottleReason) String() string {
	name := "N/A"

	switch r {
	case nvml.ClocksThrottleReasonNone:
		name = "No clocks throttling"
	case nvml.ClocksThrottleReasonGpuIdle:
		name = "Gpu Idle"
	case nvml.ClocksThrottleReasonApplicationsClocksSetting:
		name = "Applications Clocks Setting"
	case nvml.ClocksThrottleReasonSwPowerCap:
		name = "SW Power Cap"
	case nvml.ClocksThrottleReasonHwSlowdown:
		name = "HW Slowdown"
	case nvml.ClocksThrottleReasonSyncBoost:
		name = "Sync Boost"
	case nvml.ClocksThrottleReasonSwThermalSlowdown:
		name = "SW Thermal Slowdown"
	case nvml.ClocksThrottleReasonHwThermalSlowdown:
		name = "HW Thermal Slowdown"
	case nvml.ClocksThrottleReasonHwPowerBrakeSlowdown:
		name = "HW Power Brake Slowdown"
	case nvml.ClocksThrottleReasonDisplayClockSetting:
		name = "Display Clock Setting"
		// nvml.ClocksThrottleReasonAll is intentionally not handled.
	}

	return name
}

// NvmlInterface : Type to represent interactions with NVML and with the
// nvidia kernel driver. NvmlLib is the implementation backed by the real
// NVML library.
type NvmlInterface interface {
	// Init initializes NVML and the optional components selected by c.
	Init(c *config.Configuration) error
	// Shutdown shuts NVML down.
	Shutdown() error
	// GetDeviceCount returns the number of GPUs.
	GetDeviceCount() (int, error)
	// NewDevice collects the information about the GPU with index id.
	NewDevice(id int) (*FullNvmlAPIDevice, error)
	// DeviceStatus collects the current runtime status of device.
	DeviceStatus(device NvmlAPIDevice) (*NvmlAPIDeviceStatus, error)
	GetDriverName() string
	GetDriverVersion() (string, error)
	// GetCudaDriverVersion returns the CUDA driver version as (major, minor).
	GetCudaDriverVersion() (uint, uint, error)
	// GetCtlDevices returns devices required for GPU management, such as /dev/nvidiactl
	GetCtlDevices(c *config.Configuration) ([]string, error)
	// EnableDriver loads the kernel driver and starts the given services.
	EnableDriver(services []*utils.Service) error
	// DisableDriver stops the given services, blacklists and unloads the kernel driver.
	DisableDriver(services []*utils.Service) error
}

// NvmlLib : Implementation of NvmlInterface using the NVML lib
type NvmlLib struct {
	// nvidiaCapsConf is set by Init and used by NewDevice to resolve the
	// nvidia-caps minor numbers of MIG GPU/compute instances.
	nvidiaCapsConf *NvidiaCapsConfiguration
}

// initUvmDevs loads the nvidia_uvm kernel module if it is not loaded yet and
// then makes sure the nvidia-uvm device nodes exist (see checkUvmDevs).
func initUvmDevs() error {
	if loadErr := modprobe.LoadModuleIfUnloaded("nvidia_uvm"); loadErr != nil {
		return fmt.Errorf("failed to load nvidia_uvm kernel module, err: %w", loadErr)
	}

	return checkUvmDevs()
}

// initNvPeerMem loads the nv_peer_mem kernel module if it is not loaded yet.
func initNvPeerMem() error {
	loadErr := modprobe.LoadModuleIfUnloaded("nv_peer_mem")
	if loadErr == nil {
		return nil
	}

	return fmt.Errorf("failed to load nv_peer_mem kernel module, err: %w", loadErr)
}

// Init : Init NVML lib
//
// Besides nvml.Init() it performs, in order:
//   - nvidia-uvm setup unless c.NvidiaUvm is config.False; a failure is
//     fatal only when c.NvidiaUvm is config.True, otherwise it is logged and
//     uvmDevsInitialized stays false;
//   - nv_peer_mem module loading when c.PeerMem is set;
//   - creation of the nvidia caps configuration used for MIG devices.
//
// The returned error (nil on success) is reported to the tracing span
// exactly once on every return path.
func (nvmlLib *NvmlLib) Init(c *config.Configuration) error {
	sp := opentracing.StartSpan("nvml.Init")
	sp.SetTag("component", "nvgpu-manager")
	sp.SetTag("span.kind", "server")
	defer sp.Finish()

	// finish reports the final outcome to the span and hands it back, so
	// failures after nvml.Init() are no longer invisible in traces.
	finish := func(err error) error {
		utils.SpanCheckError(sp, err)
		return err
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		err := fmt.Errorf("nvidia nvml.Init() failed, err: %v", nvml.ErrorString(ret))
		ilog.Log().Error("nvml.Init()", zap.Error(err))
		return finish(err)
	}

	if c.NvidiaUvm != config.False {
		if err := initUvmDevs(); err != nil {
			err = fmt.Errorf("failed to init nvidia_uvm devices, err: %w", err)
			ilog.Log().Error("nvml.Init()", zap.Error(err))

			// Optional uvm: the failure is only logged and Init goes on.
			if c.NvidiaUvm == config.True {
				return finish(err)
			}
		} else {
			uvmDevsInitialized = true
		}
	}

	if c.PeerMem {
		if err := initNvPeerMem(); err != nil {
			return finish(fmt.Errorf("failed to init nv_peer_mem, err: %w", err))
		}
	}

	var err error
	nvmlLib.nvidiaCapsConf, err = NewNvidiaCapsConfiguration()
	if err != nil {
		return finish(fmt.Errorf("failed to init nvidia caps configuration: %w", err))
	}

	return finish(nil)
}

// Shutdown : Shutdown NVML lib
//
// A failure is logged, reported to the tracing span and returned.
func (nvmlLib *NvmlLib) Shutdown() error {
	span := opentracing.StartSpan("nvml.Shutdown")
	span.SetTag("component", "nvgpu-manager")
	span.SetTag("span.kind", "server")
	defer span.Finish()

	var shutdownErr error
	if rc := nvml.Shutdown(); rc != nvml.SUCCESS {
		shutdownErr = fmt.Errorf("nvidia nvml.Shutdown() failed, err: %v", nvml.ErrorString(rc))
		ilog.Log().Error("nvml.Shutdown()", zap.Error(shutdownErr))
	}

	utils.SpanCheckError(span, shutdownErr)
	return shutdownErr
}

// GetDeviceCount : Return the number of GPUs using NVML
//
// On failure the error is logged and reported to the tracing span; the count
// returned by NVML is passed through unchanged alongside the error.
func (nvmlLib *NvmlLib) GetDeviceCount() (int, error) {
	span := opentracing.StartSpan("nvml.GetDeviceCount")
	span.SetTag("component", "nvgpu-manager")
	span.SetTag("span.kind", "server")
	defer span.Finish()

	count, rc := nvml.DeviceGetCount()

	var countErr error
	if rc != nvml.SUCCESS {
		countErr = fmt.Errorf("nvidia nvml.DeviceGetCount() failed, err: %v", nvml.ErrorString(rc))
		ilog.Log().Error("nvml.DeviceGetCount()", zap.Error(countErr))
	}
	utils.SpanCheckError(span, countErr)

	return count, countErr
}

// _handleNvmlError turns a failed NVML return code into an error naming the
// failed call, logs it, records it on the span and returns it.
func _handleNvmlError(funcName string, ret nvml.Return, sp opentracing.Span) error {
	reason := nvml.ErrorString(ret)
	nvmlErr := fmt.Errorf("%s failed, err: %v", funcName, reason)

	ilog.Log().Error(funcName, zap.Error(nvmlErr))
	utils.SpanLogError(sp, nvmlErr)

	return nvmlErr
}

// searchForComputeInstances enumerates the compute instances that exist
// inside giInstance and stores them in giInstance.ComputeInstances, keyed by
// compute instance ID. The nvidia-caps minor (and hence Path) of every
// instance is resolved through nvCapsConf.
//
// Every (compute instance profile, engine profile) pair is probed; profiles
// NVML reports as unsupported or invalid are skipped, and each distinct
// profile ID is processed only once.
//
// On failure it returns the name of the NVML call that failed together with
// its return code; on success it returns "" and nvml.SUCCESS.
func searchForComputeInstances(giInstance *NvGpuInstance, gpuID uint32, nvCapsConf *NvidiaCapsConfiguration) (string, nvml.Return) {
	processedCiProfilesIds := map[uint32]bool{}

	// Compute instance profiles are bounded by COMPUTE_INSTANCE_PROFILE_COUNT,
	// not by the GPU instance profile count.
	for ciProfile := 0; ciProfile < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; ciProfile++ {
		for engProfile := 0; engProfile < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; engProfile++ {
			info, ret := giInstance.GpuInstance.GetComputeInstanceProfileInfo(ciProfile, engProfile)
			if ret != nvml.SUCCESS {
				if ret != nvml.ERROR_NOT_SUPPORTED && ret != nvml.ERROR_INVALID_ARGUMENT { // ciProfile = 6 => return nvml.ERROR_INVALID_ARGUMENT; TODO: understand why
					return "'nvml.GpuInstanceGetComputeInstanceProfileInfo()'", ret
				}
				continue
			}

			if processedCiProfilesIds[info.Id] {
				continue
			}
			processedCiProfilesIds[info.Id] = true

			computeInstances, ret := giInstance.GpuInstance.GetComputeInstances(&info)
			if ret != nvml.SUCCESS {
				if ret != nvml.ERROR_NOT_SUPPORTED {
					return "'nvml.GpuInstanceGetComputeInstances()'", ret
				}
				continue
			}

			for _, ciInstance := range computeInstances {
				ciInstanceInfo, ret := ciInstance.GetInfo()
				if ret != nvml.SUCCESS {
					return "'nvml.ComputeInstanceGetInfo()'", ret
				}

				minor := nvCapsConf.GetComputeInstanceMinor(gpuID, giInstance.InstanceID, ciInstanceInfo.Id)

				giInstance.ComputeInstances[ciInstanceInfo.Id] = &NvComputeInstance{
					ComputeInstance: ciInstance,
					InstanceID:      ciInstanceInfo.Id,
					InstanceMinor:   minor,
					Path:            fmt.Sprintf("/dev/nvidia-caps/nvidia-cap%d", minor)}
			}
		}
	}

	return "", nvml.SUCCESS
}

// searchForGpuInstances enumerates the MIG GPU instances of dev, stores them
// in dev.GpuInstances keyed by GPU instance ID and, for each of them,
// collects its compute instances via searchForComputeInstances.
//
// On failure it returns the name of the NVML call that failed together with
// its return code; on success it returns "" and nvml.SUCCESS.
func searchForGpuInstances(dev *FullNvmlAPIDevice, gpuID uint32, nvCapsConf *NvidiaCapsConfiguration) (string, nvml.Return) {
	// for nvml mig api see: 			https://github.com/NVIDIA/go-nvml/blob/c3a16a2b07cf2251cbedb76fa68c9292b22bfa06/pkg/nvml/device.go
	// for nvml constants see :			https://github.com/NVIDIA/go-nvml/blob/c3a16a2b07cf2251cbedb76fa68c9292b22bfa06/pkg/nvml/const.go
	// for nvml mig structs fields see: https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/nvml/mig.go

	seenProfiles := make(map[uint32]bool)

	for profile := 0; profile < nvml.GPU_INSTANCE_PROFILE_COUNT; profile++ {
		profileInfo, rc := dev.Device.GetGpuInstanceProfileInfo(profile)
		switch rc {
		case nvml.SUCCESS:
			// profile is usable, go on below
		case nvml.ERROR_NOT_SUPPORTED, nvml.ERROR_INVALID_ARGUMENT:
			// giProfile = 6 => return nvml.ERROR_INVALID_ARGUMENT; TODO: understand why
			continue
		default:
			return "'nvml.DeviceGetGpuInstanceProfileInfo()'", rc
		}

		// Several profile indexes may resolve to the same profile ID.
		if seenProfiles[profileInfo.Id] {
			continue
		}
		seenProfiles[profileInfo.Id] = true

		instances, rc := dev.Device.GetGpuInstances(&profileInfo)
		if rc == nvml.ERROR_NOT_SUPPORTED {
			continue
		}
		if rc != nvml.SUCCESS {
			return "'nvml.DeviceGetGpuInstances()'", rc
		}

		for _, instance := range instances {
			instanceInfo, rc := instance.GetInfo()
			if rc != nvml.SUCCESS {
				return "'nvml.GpuInstanceGetInfo()'", rc
			}

			capMinor := nvCapsConf.GetGpuInstanceMinor(gpuID, instanceInfo.Id)

			gi := &NvGpuInstance{
				GpuInstance:      instance,
				InstanceID:       instanceInfo.Id,
				InstanceMinor:    capMinor,
				Path:             fmt.Sprintf("/dev/nvidia-caps/nvidia-cap%d", capMinor),
				ComputeInstances: make(map[uint32]*NvComputeInstance)}
			dev.GpuInstances[instanceInfo.Id] = gi

			if failedCall, rc := searchForComputeInstances(gi, gpuID, nvCapsConf); rc != nvml.SUCCESS {
				return failedCall, rc
			}
		}
	}

	return "", nvml.SUCCESS
}

// searchForMigDevices enumerates the MIG devices of dev and appends a
// MigNvmlAPIDevice for each of them to dev.MigDevices. It expects
// dev.GpuInstances to be already filled in by searchForGpuInstances, since
// every MIG device is linked to its GPU instance and compute instance.
//
// MIG indexes without a device (nvml.ERROR_NOT_FOUND) are skipped. A MIG
// device whose GPU instance is not present in dev.GpuInstances is reported as
// nvml.ERROR_NOT_FOUND instead of dereferencing a nil instance.
//
// On failure it returns the name of the step that failed together with a
// return code; on success it returns "" and nvml.SUCCESS.
func searchForMigDevices(dev *FullNvmlAPIDevice) (string, nvml.Return) {
	maxNumMigDevices, ret := dev.Device.GetMaxMigDeviceCount()
	if ret != nvml.SUCCESS {
		return "'nvml.GetMaxMigDeviceCount()'", ret
	}

	for migIdx := 0; migIdx < maxNumMigDevices; migIdx++ {
		device, ret := dev.Device.GetMigDeviceHandleByIndex(migIdx)
		if ret != nvml.SUCCESS {
			if ret == nvml.ERROR_NOT_FOUND {
				continue
			}
			return "'nvml.GetMigDeviceHandleByIndex()'", ret
		}

		gpuInstanceID, ret := device.GetGpuInstanceId()
		if ret != nvml.SUCCESS {
			return "'nvml.DeviceGetGpuInstanceId()'", ret
		}

		computeInstanceID, ret := device.GetComputeInstanceId()
		if ret != nvml.SUCCESS {
			return "'nvml.DeviceGetComputeInstanceId()'", ret
		}

		uuid, ret := device.GetUUID()
		if ret != nvml.SUCCESS {
			return "'nvml.DeviceGetUUID()'", ret
		}

		model, ret := device.GetName()
		if ret != nvml.SUCCESS {
			if ret != nvml.ERROR_NOT_SUPPORTED {
				return "'nvml.DeviceGetName()'", ret
			}
			model = ""
		}

		// A missing map entry yields a nil *NvGpuInstance; accessing its
		// ComputeInstances would panic, so report the inconsistency instead.
		gpuInst := dev.GpuInstances[uint32(gpuInstanceID)]
		if gpuInst == nil {
			return "'searchForMigDevices(): lookup of the GPU instance of a MIG device'", nvml.ERROR_NOT_FOUND
		}
		// compInst may be nil when the compute instance was not enumerated;
		// it is stored as is, as before.
		compInst := gpuInst.ComputeInstances[uint32(computeInstanceID)]

		migDevice := MigNvmlAPIDevice{
			Device:          device,
			ParentDevice:    dev,
			GpuInstance:     gpuInst,
			ComputeInstance: compInst,
			UUID:            uuid,
			Model:           model}

		// Mig devices not supported power managment; return nvml.ERROR_UNKNOWN_ERROR

		// migDevice.Power, ret = device.GetPowerManagementLimit()
		// if ret != nvml.SUCCESS {
		// 	if ret != nvml.ERROR_NOT_SUPPORTED {
		// 		return "'nvml.DeviceGetPowerManagementLimit()'", ret
		// 	}
		// 	migDevice.Power = 0
		// }
		// migDevice.Power /= 1000

		migDevice.Power = 0

		mem, ret := device.GetMemoryInfo()
		if ret != nvml.SUCCESS {
			if ret != nvml.ERROR_NOT_SUPPORTED {
				return "'nvml.GetMemoryInfo()'", ret
			}
			mem = nvml.Memory{}
		}
		// convert bytes to MB
		migDevice.Memory = mem.Total / MB

		dev.MigDevices = append(dev.MigDevices, &migDevice)
	}

	return "", nvml.SUCCESS
}

// NewDevice : Get all information about a GPU using NVML
//
// id is the NVML device index. The returned device carries UUID, PCI bus ID,
// minor number and /dev path, model, power limit (watts), total memory (MB),
// CPU affinity (NUMA node) and, when MIG mode is enabled, the GPU instances,
// compute instances and MIG devices found on it.
//
// Properties NVML reports as nvml.ERROR_NOT_SUPPORTED are left at their zero
// value; any other NVML failure aborts with an error. Note that a pointer to
// the partially filled device is returned alongside a non-nil error.
func (nvmlLib *NvmlLib) NewDevice(id int) (*FullNvmlAPIDevice, error) {
	sp := opentracing.StartSpan("nvml.NewDevice")
	sp.SetTag("component", "nvgpu-manager")
	sp.SetTag("span.kind", "server")
	defer sp.Finish()

	var err error
	var ret nvml.Return
	var dev FullNvmlAPIDevice

	// NOTE(review): the failing call here is nvml.DeviceGetHandleByIndex(),
	// although the error message and log entry name 'nvml.NewDevice()'.
	dev.Device, ret = nvml.DeviceGetHandleByIndex(id)
	if ret != nvml.SUCCESS {
		err = fmt.Errorf("'nvml.NewDevice()' failed, err: %v", nvml.ErrorString(ret))
		ilog.Log().Error("'nvml.NewDevice()'", zap.Error(err))
		utils.SpanCheckError(sp, err)
		return &dev, err
	}

	dev.UUID, ret = dev.Device.GetUUID()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.GetUUID()'", ret, sp)
		}
		dev.UUID = ""
	}

	PCIInfo, ret := dev.Device.GetPciInfo()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.GetPciInfo()'", ret, sp)
		}
		PCIInfo = nvml.PciInfo{}
	} else {
		// Copy the NUL-terminated bus ID into a Go string.
		// NOTE(review): if no NUL byte is found the loop ends with i == 31,
		// so the last copied byte is dropped by the slice below.
		byteBusID := [32]byte{}
		var i int
		for i = range byteBusID {
			if PCIInfo.BusId[i] == 0 {
				break
			}
			byteBusID[i] = byte(PCIInfo.BusId[i])
		}
		dev.PCIBusID = string(byteBusID[:i])
	}

	// need 'minor' for 'path'
	minor, ret := dev.Device.GetMinorNumber()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.GetMinorNumber()'", ret, sp)
		}
		dev.Minor = 0
	} else {
		// Minor and Path are only set when both UUID and PCI bus ID are known.
		if dev.UUID != "" && dev.PCIBusID != "" {
			dev.Minor = uint32(minor)
			dev.Path = fmt.Sprintf("/dev/nvidia%d", dev.Minor)
		}
	}

	dev.Model, ret = dev.Device.GetName()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.GetName()'", ret, sp)
		}
		dev.Model = ""
	}

	dev.Power, ret = dev.Device.GetPowerManagementLimit()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.GetPowerManagementLimit()'", ret, sp)
		}
		dev.Power = 0
	}
	// convert to Watts as it used to be with previous nvml lib:
	dev.Power /= 1000

	mem, ret := dev.Device.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.GetMemoryInfo()'", ret, sp)
		}
		mem = nvml.Memory{}
	}
	// convert bytes to MB as it used to be with previous nvml lib:
	dev.Memory = mem.Total / MB

	// CPU affinity is the NUMA node sysfs reports for the PCI device.
	if dev.PCIBusID != "" {
		dev.CPUAffinity, err = numaNode(dev.PCIBusID)
		if err != nil {
			ilog.Log().Error("numaNode() failed", zap.Error(err))
			utils.SpanLogError(sp, err)
			return &dev, err
		}
	}

	// check whether MIG mode is enabled (only the current mode is used, the
	// pending mode is ignored)
	var migMode int
	migMode, _, ret = dev.Device.GetMigMode()
	if ret != nvml.SUCCESS {
		if ret != nvml.ERROR_NOT_SUPPORTED {
			return &dev, _handleNvmlError("'nvml.DeviceGetMigMode()'", ret, sp)
		}
		dev.MigMode = false
	} else {
		dev.MigMode = migMode == nvml.DEVICE_MIG_ENABLE
	}

	if dev.MigMode {
		// search for GPU instances
		var name string

		dev.GpuInstances = make(map[uint32]*NvGpuInstance)

		// GPU/compute instances must be collected first: searchForMigDevices
		// links every MIG device to entries of dev.GpuInstances.
		name, ret = searchForGpuInstances(&dev, uint32(dev.Minor), nvmlLib.nvidiaCapsConf)
		if ret != nvml.SUCCESS {
			return &dev, _handleNvmlError(name, ret, sp)
		}

		name, ret = searchForMigDevices(&dev)
		if ret != nvml.SUCCESS {
			return &dev, _handleNvmlError(name, ret, sp)
		}
	}

	utils.SpanCheckError(sp, err)
	return &dev, err
}

// numaNode returns the NUMA node of the PCI device identified by busid, as
// published in /sys/bus/pci/devices/<id>/numa_node.
//
// busid is expected in the form used by NewDevice (the NVML bus ID); its
// first four characters are dropped and the rest is lower-cased to build the
// sysfs device name.
//
// 0 with a nil error is returned when the node cannot be determined: the bus
// ID is too short to name a device, the sysfs file cannot be read (e.g. NUMA
// support is not enabled) or the kernel reports a negative node
// (NUMA_NO_NODE). An error is returned only when the file content is not a
// valid number.
func numaNode(busid string) (uint, error) {
	// number of leading zeros of the bus ID that sysfs does not use
	const busIDPrefixLen = 4

	// Slicing a shorter bus ID would panic; such an ID cannot name a sysfs
	// device, so handle it like an unreadable numa_node file.
	if len(busid) < busIDPrefixLen {
		return 0, nil
	}

	// discard leading zeros of busid
	b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[busIDPrefixLen:])))
	if err != nil {
		// XXX report nil if NUMA support isn't enabled
		// TODO: maybe use nil instead of 0
		return 0, nil
	}

	node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
	if err != nil {
		// TODO: provide proper description of 'ErrCPUAffinity'
		// TODO: maybe use nil instead of 0
		return 0, fmt.Errorf("CPU Affinity error: %w", err)
	}
	if node < 0 {
		// XXX report nil instead of NUMA_NO_NODE
		// TODO: maybe use nil instead of 0
		return 0, nil
	}

	return uint(node), nil
}

// DeviceStatus : Collect the current runtime status of a GPU or MIG device
// using NVML
//
// Power (watts), utilization, memory (MB), ECC errors, PCIe throughput
// (MB/s), BAR1 usage (MB), running processes and throttle reasons are read
// from the device itself. Temperature and clocks are read from the parent
// GPU when the device is a MIG device.
//
// Metrics NVML reports as nvml.ERROR_NOT_SUPPORTED are left at their zero
// value. For MIG devices some additional return codes are tolerated per
// metric. Any other NVML failure aborts with an error; the partially filled
// status is returned alongside it.
func (nvmlLib *NvmlLib) DeviceStatus(device NvmlAPIDevice) (*NvmlAPIDeviceStatus, error) {
	sp := opentracing.StartSpan("nvml.Device.Status()")
	sp.SetTag("component", "nvgpu-manager")
	sp.SetTag("span.kind", "server")
	defer sp.Finish()

	// NOTE(review): err is never assigned in this function, so the final
	// SpanCheckError call and return always see nil.
	var err error
	var ret nvml.Return
	var st NvmlAPIDeviceStatus

	dev := device.GetDevice()

	isMigDev, ret := dev.IsMigDeviceHandle()
	if ret != nvml.SUCCESS {
		return &st, _handleNvmlError("'nvml.IsMigDeviceHandle()'", ret, sp)
	}

	// parentDev is the physical GPU: the device itself, or the owner of the
	// MIG device.
	parentDev := dev
	if isMigDev {
		parentDev, ret = dev.GetDeviceHandleFromMigDeviceHandle()
		if ret != nvml.SUCCESS {
			return &st, _handleNvmlError("'nvml.GetDeviceHandleFromMigDeviceHandle()'", ret, sp)
		}
	}

	// errorSkipCond reports whether a failed NVML call may be ignored:
	// always for ERROR_NOT_SUPPORTED, and for MIG devices additionally for
	// any of migSkipCodes.
	errorSkipCond := func(ret nvml.Return, migSkipCodes ...nvml.Return) bool {
		if ret == nvml.ERROR_NOT_SUPPORTED {
			return true
		}
		if !isMigDev {
			return false
		}
		for _, state := range migSkipCodes {
			if ret == state {
				return true
			}
		}
		return false
	}

	st.Power, ret = dev.GetPowerUsage()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_UNKNOWN) {
			return &st, _handleNvmlError("'nvml.GetPowerUsage()'", ret, sp)
		}
		st.Power = 0
	}
	// convert to Watts as it used to be with previous nvml lib:
	st.Power /= 1000

	st.Temperature, ret = parentDev.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret) {
			return &st, _handleNvmlError("'nvml.GetTemperature()'", ret, sp)
		}
		st.Temperature = 0
	}

	var util nvml.Utilization
	util, ret = dev.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetUtilizationRates()'", ret, sp)
		}
		util = nvml.Utilization{}
	}
	st.Utilization.GPU = util.Gpu
	st.Utilization.Memory = util.Memory

	st.Utilization.Encoder, _, ret = dev.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetEncoderUtilization()'", ret, sp)
		}
		st.Utilization.Encoder = 0
	}

	st.Utilization.Decoder, _, ret = dev.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetDecoderUtilization()'", ret, sp)
		}
		st.Utilization.Decoder = 0
	}

	st.Memory.Global, ret = dev.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret) {
			return &st, _handleNvmlError("'nvml.GetMemoryInfo()'", ret, sp)
		}
		st.Memory.Global = nvml.Memory{}
	}
	// convert bytes to MB as it used to be with previous nvml lib:
	st.Memory.Global.Total /= MB
	st.Memory.Global.Free /= MB
	st.Memory.Global.Used /= MB

	st.Memory.ECCErrors, ret = dev.GetDetailedEccErrors(nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.VOLATILE_ECC)
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetDetailedEccErrors(nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.VOLATILE_ECC)'", ret, sp)
		}
		st.Memory.ECCErrors = nvml.EccErrorCounts{}
	}

	st.Clocks.Cores, ret = parentDev.GetClockInfo(nvml.CLOCK_SM)
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret) {
			return &st, _handleNvmlError("'nvml.GetClockInfo(nvml.CLOCK_SM)'", ret, sp)
		}
		st.Clocks.Cores = 0
	}

	st.Clocks.Memory, ret = parentDev.GetClockInfo(nvml.CLOCK_MEM)
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret) {
			return &st, _handleNvmlError("'nvml.GetClockInfo(nvml.CLOCK_MEM)'", ret, sp)
		}
		st.Clocks.Memory = 0
	}

	st.PCI.Throughput.RX, ret = dev.GetPcieThroughput(nvml.PCIE_UTIL_RX_BYTES)
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetPcieThroughput(nvml.PCIE_UTIL_RX_BYTES)'", ret, sp)
		}
		st.PCI.Throughput.RX = 0
	}
	// convert KB/s to MB/s as it used to be with previous nvml lib:
	st.PCI.Throughput.RX /= 1000

	st.PCI.Throughput.TX, ret = dev.GetPcieThroughput(nvml.PCIE_UTIL_TX_BYTES)
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetPcieThroughput(nvml.PCIE_UTIL_TX_BYTES)'", ret, sp)
		}
		st.PCI.Throughput.TX = 0
	}
	// convert KB/s to MB/s as it used to be with previous nvml lib:
	st.PCI.Throughput.TX /= 1000

	BAR1Mem, ret := dev.GetBAR1MemoryInfo()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret) {
			return &st, _handleNvmlError("'nvml.GetBAR1MemoryInfo()'", ret, sp)
		}
		BAR1Mem = nvml.BAR1Memory{}
	}
	// convert bytes to MB as it used to be with previous nvml lib:
	st.PCI.BAR1Used = BAR1Mem.Bar1Used / MB

	// Processes: compute processes first, then graphics processes whose PID
	// is not already listed as a compute process.
	st.Processes, ret = dev.GetComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret) {
			return &st, _handleNvmlError("'nvml.GetComputeRunningProcesses()'", ret, sp)
		}
		st.Processes = []nvml.ProcessInfo{}
	} else {
		cPidsSet := make(map[uint32]bool)
		for i := range st.Processes {
			cPidsSet[st.Processes[i].Pid] = true
			// convert bytes to MB as it used to be with previous nvml lib:
			st.Processes[i].UsedGpuMemory /= MB
		}

		gProcesses, ret := dev.GetGraphicsRunningProcesses()
		if ret != nvml.SUCCESS {
			if !errorSkipCond(ret) {
				return &st, _handleNvmlError("'nvml.GetGraphicsRunningProcesses()'", ret, sp)
			}
			// NOTE(review): this also discards the compute processes
			// collected above when graphics processes are merely
			// unsupported — confirm this is intended.
			st.Processes = []nvml.ProcessInfo{}
		} else {
			for i := range gProcesses {
				if cPidsSet[gProcesses[i].Pid] {
					continue
				}
				// convert bytes to MB as it used to be with previous nvml lib:
				gProcesses[i].UsedGpuMemory /= MB
				st.Processes = append(st.Processes, gProcesses[i])
			}
		}
	}

	throttleReason, ret := dev.GetCurrentClocksThrottleReasons()
	if ret != nvml.SUCCESS {
		if !errorSkipCond(ret, nvml.ERROR_INVALID_ARGUMENT) {
			return &st, _handleNvmlError("'nvml.GetCurrentClocksThrottleReasons()'", ret, sp)
		}
		throttleReason = 0
	}
	st.Throttle = ThrottleReason(throttleReason)

	utils.SpanCheckError(sp, err)
	return &st, err
}

// GetDriverName : Return the driver name
//
// The name is a fixed string and does not depend on NVML state.
func (nvmlLib *NvmlLib) GetDriverName() string {
	const driverName = "nvidia"
	return driverName
}

// GetDriverVersion : Return the driver version using NVML
//
// On failure the error is logged and reported to the tracing span; the
// version string returned by NVML is passed through unchanged alongside it.
func (nvmlLib *NvmlLib) GetDriverVersion() (string, error) {
	span := opentracing.StartSpan("nvml.GetDriverVersion")
	span.SetTag("component", "nvgpu-manager")
	span.SetTag("span.kind", "server")
	defer span.Finish()

	version, rc := nvml.SystemGetDriverVersion()

	var versionErr error
	if rc != nvml.SUCCESS {
		versionErr = fmt.Errorf("nvidia Device.SystemGetDriverVersion() failed, err: %v", nvml.ErrorString(rc))
		ilog.Log().Error("Device.SystemGetDriverVersion()", zap.Error(versionErr))
	}
	utils.SpanCheckError(span, versionErr)

	return version, versionErr
}

// GetCudaDriverVersion : Return the cuda version using NVML
//
// NVML encodes the version as major*1000 + minor*10 (e.g. 11020 for CUDA
// 11.2), so the minor part is (version % 1000) / 10. Returns (major, minor,
// nil) on success and (0, 0, err) when NVML fails; the failure is logged and
// reported to the tracing span.
func (nvmlLib *NvmlLib) GetCudaDriverVersion() (uint, uint, error) {
	sp := opentracing.StartSpan("nvml.GetCudaDriverVersion")
	sp.SetTag("component", "nvgpu-manager")
	sp.SetTag("span.kind", "server")
	defer sp.Finish()

	var err error
	var major, minor uint
	ver, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		err = fmt.Errorf("nvidia Device.SystemGetCudaDriverVersion() failed, err: %v", nvml.ErrorString(ret))
		ilog.Log().Error("Device.SystemGetCudaDriverVersion()", zap.Error(err))
	} else {
		major = uint(ver / 1000)
		// the last decimal digit is not part of the minor version
		minor = uint(ver % 1000 / 10)
	}
	utils.SpanCheckError(sp, err)

	return major, minor, err
}

// GetCtlDevices returns devices required for GPU management.
//
// /dev/nvidiactl is always listed. The two nvidia-uvm nodes are added when
// uvm is mandatory, or optional and successfully initialized by Init. The
// nvlink node is added when it is present. The returned error is always nil.
func (nvmlLib *NvmlLib) GetCtlDevices(c *config.Configuration) ([]string, error) {
	ctlDevs := []string{NvidiaCtlDev}

	uvmRequired := c.NvidiaUvm == config.True
	uvmOptionalReady := c.NvidiaUvm == config.Optional && uvmDevsInitialized
	if uvmRequired || uvmOptionalReady {
		ctlDevs = append(ctlDevs, NvidiaUvmDev, NvidiaUvmToolsDev)
	}

	if _, statErr := os.Stat(NvidiaNvlinkDev); statErr == nil || os.IsExist(statErr) {
		ctlDevs = append(ctlDevs, NvidiaNvlinkDev)
	}

	return ctlDevs, nil
}

// EnableDriver loads the nvidia kernel driver and starts the given services.
// It wraps doEnableDriver in a tracing span and reports its result there.
func (nvmlLib *NvmlLib) EnableDriver(services []*utils.Service) error {
	span := opentracing.StartSpan("nvml.EnableDriver")
	span.SetTag("component", "nvgpu-manager")
	span.SetTag("span.kind", "server")
	defer span.Finish()

	enableErr := nvmlLib.doEnableDriver(services)
	utils.SpanCheckError(span, enableErr)

	return enableErr
}

// checkUvmDevs makes sure the /dev/nvidia-uvm and /dev/nvidia-uvm-tools
// device nodes exist. A missing node is created as a character device using
// the major number registered for nvidia-uvm, with minor 0 for nvidia-uvm
// and minor 1 for nvidia-uvm-tools, and is then given NvidiaUvmDevsPerms
// mode and root:root ownership.
func checkUvmDevs() error {
	_, errDev := os.Stat(NvidiaUvmDev)
	_, errTools := os.Stat(NvidiaUvmToolsDev)

	bothPresent := errDev == nil && errTools == nil
	bothReportedExisting := os.IsExist(errDev) && os.IsExist(errTools)
	if bothPresent || bothReportedExisting {
		return nil
	}

	maj, err := utils.GetDevMaj(NvidiaUvmDevName)
	if err != nil {
		return fmt.Errorf("GetDevMaj() failed, err: %w", err)
	}

	// The position in this list is the minor number of the node.
	nodes := []struct {
		statErr error
		path    string
	}{
		{errDev, NvidiaUvmDev},
		{errTools, NvidiaUvmToolsDev},
	}

	for minor, node := range nodes {
		// Only nodes that are definitely missing get created.
		if !os.IsNotExist(node.statErr) {
			continue
		}

		devNum := int(unix.Mkdev(uint32(maj), uint32(minor)))
		if err := syscall.Mknod(node.path, syscall.S_IFCHR|NvidiaUvmDevsPerms, devNum); err != nil {
			return fmt.Errorf("failed to mknod %s, err: %w", node.path, err)
		}
		ilog.Log().Info("created " + node.path + " device with maj:min=" + strconv.Itoa(maj) + ":" + strconv.Itoa(minor))

		// mknod is subject to the umask, so set the mode explicitly.
		if err := os.Chmod(node.path, NvidiaUvmDevsPerms); err != nil {
			return fmt.Errorf("failed to chmod %s, err: %w", node.path, err)
		}
		if err := os.Chown(node.path, 0, 0); err != nil {
			return fmt.Errorf("failed to chown %s, err: %w", node.path, err)
		}
	}

	return nil
}

// doEnableDriver loads the nvidia kernel driver.
//
// The blacklist file is removed first when it exists; if that removal fails
// the services are started anyway and the removal error is returned. The
// "nvidia" module is then loaded and the services are started. Service start
// failures are only logged; the result of the module load is returned.
func (nvmlLib *NvmlLib) doEnableDriver(services []*utils.Service) error {
	// Check whether the module was blacklisted.
	if _, statErr := os.Stat(NvidiaBlackListConfig); statErr == nil {
		if rmErr := os.Remove(NvidiaBlackListConfig); rmErr != nil {
			for _, svc := range services {
				_ = svc.Start()
			}
			return rmErr
		}
	}

	loadErr := modprobe.LoadModule("nvidia")

	for _, svc := range services {
		if startErr := svc.Start(); startErr != nil {
			// Not critical error, try to move forward
			ilog.Log().Error("failed to enable service", zap.Error(startErr))
		}
	}

	return loadErr
}

// DisableDriver blacklists and unloads the nvidia kernel driver.
// It wraps doDisableDriver in a tracing span and reports its result there.
func (nvmlLib *NvmlLib) DisableDriver(services []*utils.Service) error {
	span := opentracing.StartSpan("nvml.DisableDriver")
	span.SetTag("component", "nvgpu-manager")
	span.SetTag("span.kind", "server")
	defer span.Finish()

	disableErr := nvmlLib.doDisableDriver(services)
	utils.SpanCheckError(span, disableErr)

	return disableErr
}

// doDisableDriver blacklists and unloads the nvidia kernel driver.
//
// Steps, in order:
//  1. blacklist every known module name in NvidiaBlackListConfig;
//  2. stop the given services (failures are only logged);
//  3. unload every loaded module of NvidiaModuleList;
//  4. verify that none of the modules is loaded any more.
//
// When step 3 or 4 fails the services are started again and the blacklist
// file is removed before the error is returned. When step 1 fails only the
// blacklist file is removed, as the services have not been stopped yet.
func (nvmlLib *NvmlLib) doDisableDriver(services []*utils.Service) error {
	// rollback undoes the service stop and the blacklisting (best effort).
	rollback := func() {
		for _, svc := range services {
			_ = svc.Start()
		}
		_ = os.Remove(NvidiaBlackListConfig)
	}

	for _, mod := range NvidiaModuleList {
		for _, name := range mod.Names {
			if blErr := modprobe.BlacklistModule(name, NvidiaBlackListConfig); blErr != nil {
				_ = os.Remove(NvidiaBlackListConfig)
				return blErr
			}
		}
	}

	for _, svc := range services {
		if stopErr := svc.Stop(); stopErr != nil {
			// Not critical error, try to move forward
			ilog.Log().Error("fail to disable service", zap.Error(stopErr))
		}
	}

	for _, mod := range NvidiaModuleList {
		if !modprobe.IsModuleLoaded(mod.Alias) {
			continue
		}

		unloadErr := modprobe.UnloadModule(mod.Alias)
		ilog.Log().Debug("UnloadModule", zap.String("module", mod.Alias), zap.Error(unloadErr))
		if unloadErr == nil {
			continue
		}
		if !modprobe.IsModuleLoaded(mod.Alias) {
			// Someone else unloaded it for us.
			continue
		}

		rollback()
		return unloadErr
	}

	// Validate that the modules are gone.
	loaded := modprobe.LoadedModules()
	ilog.Log().Debug("Dump loaded modules", zap.Any("modules", loaded))
	for _, mod := range NvidiaModuleList {
		stillLoaded := modprobe.IsModuleLoaded(mod.Alias)
		ilog.Log().Debug("Validate module state", zap.Any("module", mod), zap.Bool("is_loaded", stillLoaded))
		if !stillLoaded {
			continue
		}

		rollback()
		return errors.New("Module '" + mod.Alias + "' still present")
	}

	return nil
}
