package servicepodalerts

import (
	"errors"
	"fmt"
	"sort"
	"strconv"
	"time"

	"go.temporal.io/sdk/temporal"
	"go.temporal.io/sdk/workflow"
	"gopkg.in/yaml.v2"

	pb "a.yandex-team.ru/infra/nanny/go/proto/nanny_repo"
	"a.yandex-team.ru/infra/temporal/activities/abc"
	"a.yandex-team.ru/infra/temporal/activities/io"
	"a.yandex-team.ru/infra/temporal/activities/nanny/pods"
	"a.yandex-team.ru/infra/temporal/activities/nanny/services"
	nannyClient "a.yandex-team.ru/infra/temporal/clients/nanny"
	"a.yandex-team.ru/library/go/core/buildinfo"
)

// ConfigFileNameEnvironmentVariable is the argument getConfig passes to the
// GetFileContent activity when loading the workflow configuration.
// NOTE(review): judging by the name, the activity treats this value as the name
// of an environment variable that holds the config file path — confirm against
// the io.Activities implementation.
const ConfigFileNameEnvironmentVariable = "SERVICEPODALERTS_CONFIG_FILE_NAME"

// TimeoutConfig is the `timeout_config` section of the YAML configuration.
//
// In this file YpRequestTimeout is used as the StartToCloseTimeout of the
// pods-listing activity and NannyRequestTimeout as the StartToCloseTimeout of
// the service activities. AbcRequestTimeout is not read in this file; the whole
// struct is also handed to the TicketController (presumably consumed there —
// confirm).
type TimeoutConfig struct {
	YpRequestTimeout    time.Duration `yaml:"yp_request_timeout"`
	AbcRequestTimeout   time.Duration `yaml:"abc_request_timeout"`
	NannyRequestTimeout time.Duration `yaml:"nanny_request_timeout"`
}

// TicketConfig is the `ticket_config` section of the YAML configuration.
//
// ServicePodAlertsWorkflow copies every field one-to-one into the
// TicketControllerConfig it builds; the meaning of the individual settings is
// defined by the ticket controller and is not visible in this file.
type TicketConfig struct {
	NamespaceID                       string        `yaml:"namespace_id"`
	TaskQueue                         string        `yaml:"task_queue"`
	Queue                             string        `yaml:"queue"`
	Tags                              []string      `yaml:"tags"`
	RetryInvocationPeriod             time.Duration `yaml:"retry_invocation_period"`
	NannyScheduleID                   int           `yaml:"nanny_schedule_id"`
	MaxResponsibleToSummon            int           `yaml:"max_responsible_to_summon"`
	MaxFaultyPodsTableSize            int           `yaml:"max_faulty_pods_table_size"`
	MaxEvictionRequestedPodsTableSize int           `yaml:"max_eviction_requested_pods_table_size"`
	ManualEvictionPeriod              time.Duration `yaml:"manual_eviction_period"`
	BudgetLeftThreshold               float64       `yaml:"budget_left_threshold"`
	MinBudgetLeft                     int           `yaml:"min_budget_left"`
}

// Config is the root of the workflow configuration; getConfig decodes it from
// the YAML document returned by the GetFileContent activity.
//
// TaskQueue is the task queue used for the pod/service activities started from
// this file, and PollPeriod is how long the workflow sleeps between iterations.
// StatesStoreMaxLimit and StatesStoreMinLimit drive history trimming in
// cleanState: once more than StatesStoreMaxLimit states are stored, only the
// newest StatesStoreMinLimit are kept. The nested *Config pointers are
// dereferenced without nil checks when the workflow starts, so all sections are
// expected to be present in the YAML.
type Config struct {
	TaskQueue                      string                          `yaml:"task_queue"`
	PollPeriod                     time.Duration                   `yaml:"poll_period"`
	StatesStoreMinLimit            int                             `yaml:"states_store_min_limit"`
	StatesStoreMaxLimit            int                             `yaml:"states_store_max_limit"`
	PodsWindowCheckerConfig        *WindowCheckerConfig            `yaml:"pods_window_checker_config"`
	ServiceWindowCheckerConfig     *WindowCheckerConfig            `yaml:"service_window_checker_config"`
	EvictionRequestedCheckerConfig *EvictionRequestedCheckerConfig `yaml:"eviction_requested_checker_config"`
	TicketConfig                   *TicketConfig                   `yaml:"ticket_config"`
	TimeoutConfig                  *TimeoutConfig                  `yaml:"timeout_config"`
	NannyURL                       string                          `yaml:"nanny_url"`
	DeployURL                      string                          `yaml:"deploy_url"`
}

// ServiceInfo holds service-level data refreshed on every iteration by
// runIteration: InfoAttrs and ReplicationPolicy are the results of the
// corresponding service activities, and MaxUnavailablePods is the value
// computed by getMaxUnavailablePods from the replication policy.
type ServiceInfo struct {
	InfoAttrs          *nannyClient.ServiceInfoAttrs
	ReplicationPolicy  *pb.ReplicationPolicy
	MaxUnavailablePods int
}

// Info is the descriptive (non time-series) data the workflow carries between
// iterations. Entries are added to PodsInfo by populateStateWithPod and removed
// by cleanState for pods reported as healthy.
type Info struct {
	PodsInfo    map[string]*pods.PodInfo // key: HostName
	ServiceInfo *ServiceInfo
}

// ServiceState is the service-level part of a State as built by
// getServiceState: ServiceStatus is the summary value of the service status and
// SnapshotStatus is the status of the first snapshot listed in it.
type ServiceState struct {
	SnapshotStatus pb.SnapshotStatus_Status
	ServiceStatus  string
}

// PodState is the per-pod part of a State; populateStateWithPod fills all three
// fields by copying the same-named fields of the pods.Pod returned by the
// pods-listing activity.
type PodState struct {
	State       string
	Eviction    *pods.Eviction
	Maintenance *pods.Maintenance
}

// State is one observation of the service taken by updateState. Timestamp is
// the workflow time at which the observation was completed and TotalPodsCount
// is the value reported by the pods-listing activity. Pods that were never
// tracked and are currently active are not added to PodStates.
type State struct {
	ServiceState   *ServiceState
	PodStates      map[string]*PodState // key: HostName
	Timestamp      time.Time
	TotalPodsCount int
}

// PodStatesController runs a single monitoring iteration for one service inside
// a workflow run: it fetches service and pod data through activities, appends a
// new State, runs the checkers and delegates ticket handling to the
// TicketController.
//
// The *Activities fields are only used to reference activity methods when
// scheduling them; ServicePodAlertsWorkflow leaves them nil.
type PodStatesController struct {
	cfg               *Config
	ctx               workflow.Context
	podActivities     *pods.Activities
	serviceActivities *services.Activities
	abcActivities     *abc.Activities
	// per-service data; states and info are carried over between workflow runs
	serviceID        string
	checkers         []Checker
	states           []*State
	info             *Info
	ticketController *TicketController
}

// NoSnapshotsInServiceError reports that the service status contains no
// snapshots. Callers detect it with errors.As.
type NoSnapshotsInServiceError struct {
	// err is an optional custom message; when empty a default one is used.
	err string
}

// Error implements the error interface.
//
// The zero value (and a nil receiver) yields a default message instead of an
// empty string, so wrapped errors such as "error during state updating: %w"
// never end with nothing after the colon.
func (e *NoSnapshotsInServiceError) Error() string {
	if e == nil || e.err == "" {
		return "service has no snapshots"
	}
	return e.err
}

// getConfig loads the workflow configuration: it runs the GetFileContent
// activity (one minute start-to-close timeout) with
// ConfigFileNameEnvironmentVariable as its argument and decodes the returned
// bytes as YAML into a Config.
//
// It returns the activity error or the YAML decoding error unchanged.
func getConfig(ctx workflow.Context) (*Config, error) {
	activityCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{
		StartToCloseTimeout: time.Minute,
	})

	var raw []byte
	future := workflow.ExecuteActivity(
		activityCtx,
		(&io.Activities{}).GetFileContent,
		ConfigFileNameEnvironmentVariable,
	)
	if err := future.Get(activityCtx, &raw); err != nil {
		return nil, err
	}

	parsed := new(Config)
	if err := yaml.Unmarshal(raw, parsed); err != nil {
		return nil, err
	}
	return parsed, nil
}

// getServicePods runs GetPodsForServiceActivity for the controller's service.
//
// Besides the service ID the activity receives the host names of all pods
// present in the most recent stored State (an empty, non-nil list when no state
// has been stored yet). The names are sorted before being passed on: Go
// randomizes map iteration order, and workflow code must produce the same
// activity input on every execution and replay.
//
// It returns the activity response, or the activity error unchanged.
func (ctrl *PodStatesController) getServicePods() (*pods.PodsForServiceResponse, error) {
	ao := workflow.ActivityOptions{
		TaskQueue:           ctrl.cfg.TaskQueue,
		StartToCloseTimeout: ctrl.cfg.TimeoutConfig.YpRequestTimeout,
	}
	activityCtx := workflow.WithActivityOptions(ctrl.ctx, ao)

	// Keep the slice non-nil so the serialized activity argument stays a list.
	faultyPods := []string{}
	if len(ctrl.states) != 0 {
		lastPodStates := ctrl.states[len(ctrl.states)-1].PodStates
		faultyPods = make([]string, 0, len(lastPodStates))
		for podName := range lastPodStates {
			faultyPods = append(faultyPods, podName)
		}
		sort.Strings(faultyPods)
	}

	response := &pods.PodsForServiceResponse{}
	err := workflow.ExecuteActivity(
		activityCtx,
		ctrl.podActivities.GetPodsForServiceActivity,
		ctrl.serviceID,
		faultyPods).Get(activityCtx, &response)
	if err != nil {
		return nil, err
	}
	return response, nil
}

// getServiceInfoAttrs runs GetServiceInfoAttrsActivity for the controller's
// service and returns the decoded attributes.
//
// The activity is scheduled with the Nanny request timeout; errors of type
// "ServiceNotFoundError" are marked non-retryable so they are returned to the
// caller instead of being retried.
func (ctrl *PodStatesController) getServiceInfoAttrs() (*nannyClient.ServiceInfoAttrs, error) {
	activityCtx := workflow.WithActivityOptions(ctrl.ctx, workflow.ActivityOptions{
		TaskQueue:           ctrl.cfg.TaskQueue,
		StartToCloseTimeout: ctrl.cfg.TimeoutConfig.NannyRequestTimeout,
		RetryPolicy: &temporal.RetryPolicy{
			NonRetryableErrorTypes: []string{"ServiceNotFoundError"},
		},
	})

	attrs := new(nannyClient.ServiceInfoAttrs)
	future := workflow.ExecuteActivity(
		activityCtx,
		ctrl.serviceActivities.GetServiceInfoAttrsActivity,
		ctrl.serviceID,
	)
	if err := future.Get(activityCtx, attrs); err != nil {
		return nil, err
	}
	return attrs, nil
}

// getServiceReplicationPolicy runs the GetServiceReplicationPolicy activity for
// the controller's service and returns the decoded policy.
//
// The activity is scheduled with the Nanny request timeout; errors of type
// "ServiceNotFoundError" are marked non-retryable so they are returned to the
// caller instead of being retried.
func (ctrl *PodStatesController) getServiceReplicationPolicy() (*pb.ReplicationPolicy, error) {
	activityCtx := workflow.WithActivityOptions(ctrl.ctx, workflow.ActivityOptions{
		TaskQueue:           ctrl.cfg.TaskQueue,
		StartToCloseTimeout: ctrl.cfg.TimeoutConfig.NannyRequestTimeout,
		RetryPolicy: &temporal.RetryPolicy{
			NonRetryableErrorTypes: []string{"ServiceNotFoundError"},
		},
	})

	policy := new(pb.ReplicationPolicy)
	future := workflow.ExecuteActivity(
		activityCtx,
		ctrl.serviceActivities.GetServiceReplicationPolicy,
		ctrl.serviceID,
	)
	if err := future.Get(activityCtx, policy); err != nil {
		return nil, err
	}
	return policy, nil
}

// getServiceState runs the GetService activity and extracts the service-level
// state from the result.
//
// Returns:
//   - a ServiceState with the status summary value and the status of the first
//     listed snapshot;
//   - the activity error unchanged;
//   - a plain error when the status summary value is empty;
//   - *NoSnapshotsInServiceError when the service status lists no snapshots.
func (ctrl *PodStatesController) getServiceState() (*ServiceState, error) {
	ao := workflow.ActivityOptions{
		TaskQueue:           ctrl.cfg.TaskQueue,
		StartToCloseTimeout: ctrl.cfg.TimeoutConfig.NannyRequestTimeout,
	}
	activityCtx := workflow.WithActivityOptions(ctrl.ctx, ao)
	var service *pb.Service
	err := workflow.ExecuteActivity(
		activityCtx,
		ctrl.serviceActivities.GetService,
		ctrl.serviceID).Get(activityCtx, &service)
	if err != nil {
		return nil, err
	}

	status := service.GetStatus().GetSummary().GetValue()
	if status == "" {
		return nil, errors.New("cannot fetch service status")
	}

	snapshots := service.GetStatus().GetSnapshot()
	if len(snapshots) == 0 {
		// Give the error a message: it gets wrapped by callers and an empty
		// text would leave nothing after the wrapping prefix.
		return nil, &NoSnapshotsInServiceError{
			err: fmt.Sprintf("service %s has no snapshots", ctrl.serviceID),
		}
	}

	// A second `status == ""` check used to follow here; it re-tested the
	// service status and could never fire. The snapshot status is an enum, so
	// validating it would require knowing which value means "unset".
	return &ServiceState{
		ServiceStatus:  status,
		SnapshotStatus: snapshots[0].GetStatus(),
	}, nil
}

// cleanState trims the stored history and forgets pods that recovered.
//
// When more than StatesStoreMaxLimit states are stored, only the newest
// StatesStoreMinLimit of them are kept. The number kept is clamped to
// [0, len(states)], so a misconfigured minimum (negative, or larger than the
// current history) cannot produce an out-of-range slice index and panic.
//
// Every pod listed in decision.Pods.HealthyPodNames is then removed from the
// pod info map and from every remaining state.
func (ctrl *PodStatesController) cleanState(decision *Decision) {
	if total := len(ctrl.states); total > ctrl.cfg.StatesStoreMaxLimit {
		keep := ctrl.cfg.StatesStoreMinLimit
		if keep < 0 {
			keep = 0
		}
		if keep > total {
			keep = total
		}
		// shrink states size to StatesStoreMinLimit
		ctrl.states = ctrl.states[total-keep:]
	}

	// delete pods from podInfo and all the states
	for _, podName := range decision.Pods.HealthyPodNames {
		delete(ctrl.info.PodsInfo, podName)

		for _, state := range ctrl.states {
			delete(state.PodStates, podName)
		}
	}
}

// populateStateWithPod records pod in state and refreshes its stored pod info.
//
// A pod that is not tracked yet and is in the active state is skipped. For any
// other pod the tracked PodInfo (created on first sight) is overwritten with a
// copy of pod.PodInfo, and a PodState is stored in state under the host name.
func (ctrl *PodStatesController) populateStateWithPod(pod *pods.Pod, state *State) {
	hostName := pod.PodInfo.HostName
	tracked, known := ctrl.info.PodsInfo[hostName]

	switch {
	case known:
		// already tracked: just refresh below
	case pod.State == pods.ActiveState:
		// healthy pod we have never tracked: nothing to record
		return
	default:
		tracked = &pods.PodInfo{}
		ctrl.info.PodsInfo[hostName] = tracked
	}

	// Copy the value so the stored entry does not alias the activity response.
	*tracked = *pod.PodInfo

	podState := &PodState{
		State:       pod.State,
		Eviction:    pod.Eviction,
		Maintenance: pod.Maintenance,
	}
	state.PodStates[tracked.HostName] = podState
}

// updateState takes a new observation of the service and appends it to the
// stored history.
//
// It lists the service pods, records them via populateStateWithPod, fetches the
// service state and stamps the observation with the current workflow time.
// Nothing is appended when either fetch fails; the error is returned unchanged.
func (ctrl *PodStatesController) updateState() error {
	podsResponse, err := ctrl.getServicePods()
	if err != nil {
		return err
	}

	observation := &State{
		ServiceState:   &ServiceState{},
		PodStates:      make(map[string]*PodState),
		TotalPodsCount: podsResponse.TotalPodsCount,
	}
	for _, p := range podsResponse.Pods {
		ctrl.populateStateWithPod(p, observation)
	}

	observation.ServiceState, err = ctrl.fetchServiceStateOrNil()
	if err != nil {
		return err
	}

	observation.Timestamp = workflow.Now(ctrl.ctx)
	ctrl.states = append(ctrl.states, observation)
	return nil
}

// fetchServiceStateOrNil is a thin wrapper around getServiceState that returns
// a nil state whenever an error is returned.
func (ctrl *PodStatesController) fetchServiceStateOrNil() (*ServiceState, error) {
	serviceState, err := ctrl.getServiceState()
	if err != nil {
		return nil, err
	}
	return serviceState, nil
}

// getMaxUnavailablePods derives how many pods may be unavailable from the
// replication policy spec and the pod count of the most recent State. It must
// be called after at least one State has been stored.
//
// Rules:
//   - a spec with a non-empty pod group ID path yields 0 (sharded services are
//     always notified);
//   - ABSOLUTE: the spec's MaxUnavailable;
//   - MIXED: the larger of MaxUnavailable and 5% of the pods;
//   - PERCENT: the larger of 1 and MaxUnavailablePercent% of the pods;
//   - any other kind is an error.
//
// Percent results are truncated towards zero. Spec fields are read through the
// generated getters so that a policy without a spec yields zero values instead
// of a nil-pointer panic.
func (ctrl *PodStatesController) getMaxUnavailablePods() (int, error) {
	podsCount := ctrl.states[len(ctrl.states)-1].TotalPodsCount

	spec := ctrl.info.ServiceInfo.ReplicationPolicy.GetSpec()

	if spec.GetPodGroupIdPath() != "" {
		// we always want to notify sharded services
		return 0, nil
	}

	var absolute int
	var percent float64
	switch spec.GetDisruptionBudgetKind() {
	case pb.ReplicationPolicySpec_ABSOLUTE:
		return int(spec.GetMaxUnavailable()), nil
	case pb.ReplicationPolicySpec_MIXED:
		absolute = int(spec.GetMaxUnavailable())
		percent = 0.05
	case pb.ReplicationPolicySpec_PERCENT:
		absolute = 1
		percent = float64(spec.GetMaxUnavailablePercent()) / 100
	default:
		return 0, errors.New("unknown disruption budget kind")
	}

	unavailable := int(percent * float64(podsCount))
	if unavailable > absolute {
		return unavailable, nil
	}
	return absolute, nil
}

// runIteration performs one monitoring pass for the service.
//
// Steps, in order:
//  1. fetch the service info attributes; for a testing environment drop the
//     stored states, close the startreker workflow if there is one, and stop;
//  2. fetch the replication policy and take a new observation (updateState);
//  3. compute the allowed number of unavailable pods;
//  4. run the checkers and clean up state according to their decision;
//  5. close the startreker workflow when nothing is faulty and no eviction is
//     requested, otherwise create or update the ticket.
//
// Every failure is returned wrapped with a description of the failed step.
func (ctrl *PodStatesController) runIteration() error {
	attrs, err := ctrl.getServiceInfoAttrs()
	if err != nil {
		return fmt.Errorf("unable to get info attrs: %w", err)
	}

	if attrs.EnvType == nannyClient.EnvTypeTesting {
		// The service will stay in testing for a while, so its old state is
		// not needed any more.
		if len(ctrl.states) > 0 {
			ctrl.states = nil
		}
		if closeErr := ctrl.ticketController.closeStartrekerWorkflowIfExists(CloseReasonNowTestingText); closeErr != nil {
			return fmt.Errorf("unable to close startreker workflow for service with testing env: %w", closeErr)
		}
		return nil
	}

	serviceInfo := ctrl.info.ServiceInfo
	serviceInfo.InfoAttrs = attrs

	policy, err := ctrl.getServiceReplicationPolicy()
	if err != nil {
		return fmt.Errorf("unable to get replication policy: %w", err)
	}
	serviceInfo.ReplicationPolicy = policy

	if err = ctrl.updateState(); err != nil {
		return fmt.Errorf("error during state updating: %w", err)
	}

	budget, err := ctrl.getMaxUnavailablePods()
	if err != nil {
		return fmt.Errorf("unable to calculate max unavailable pods: %w", err)
	}
	serviceInfo.MaxUnavailablePods = budget

	decision, err := runChecks(ctrl.states, ctrl.checkers, ctrl.info, ctrl.serviceID)
	if err != nil {
		return fmt.Errorf("error during checks run: %w", err)
	}
	ctrl.cleanState(decision)

	// Something to report: faulty pods or pending eviction requests.
	if decision.Pods.Faulty || len(decision.Pods.EvictionRequestedPodNames) != 0 {
		if err = ctrl.ticketController.CreateOrUpdateTicket(decision, ctrl.info); err != nil {
			return fmt.Errorf("ticket controller failed to create or update ticket: %w", err)
		}
		return nil
	}

	if err = ctrl.ticketController.closeStartrekerWorkflowIfExists(CloseReasonAllOk); err != nil {
		return fmt.Errorf("unable to close startreker workflow for service without notifications: %w", err)
	}
	return nil
}

// getCurrentVersion converts the build timestamp from buildinfo into a
// workflow.Version. It panics when the timestamp is not a decimal integer.
func getCurrentVersion() workflow.Version {
	timestamp := buildinfo.Info.BuildTimestamp

	parsed, parseErr := strconv.Atoi(timestamp)
	if parseErr == nil {
		return workflow.Version(parsed)
	}
	panic(fmt.Errorf("cannot parse build timestamp from buildinfo: %w", parseErr))
}

// WorkflowState is the input of ServicePodAlertsWorkflow and the payload it
// hands to the next run when continuing as new.
//
// Config is still written on continue-as-new but is not read by the workflow in
// this file: the configuration is reloaded through getConfig on every run.
// Info may be nil on the first run; the workflow then creates an empty one.
type WorkflowState struct {
	ServiceID           string
	Config              *Config // for backwards compatibility
	StartrekerExecution *StartrekerExecution
	States              []*State
	Info                *Info
}

// ServicePodAlertsWorkflow monitors one service: each run performs a single
// iteration, sleeps for the configured poll period and continues as new with
// the accumulated state.
//
// Flow:
//   - if the version recorded for this run differs from the version of the
//     running binary, immediately continue as new with the unchanged state;
//   - load the configuration and build the controller;
//   - run one iteration. A NoSnapshotsInServiceError is only logged. An
//     application error of type "ServiceNotFoundError" closes the startreker
//     workflow (if any) and completes the workflow. Any other error is logged
//     and the workflow carries on;
//   - sleep for PollPeriod (a sleep error is logged, not returned) and continue
//     as new.
//
// The workflow returns a non-continue-as-new error only when the configuration
// cannot be loaded or when closing the startreker workflow for a deleted
// service fails.
func ServicePodAlertsWorkflow(ctx workflow.Context, state *WorkflowState) error {
	currentVersion := getCurrentVersion()
	version := workflow.GetVersion(ctx, "WorkflowVersioning", workflow.DefaultVersion, currentVersion)
	if version != currentVersion {
		return workflow.NewContinueAsNewError(ctx, ServicePodAlertsWorkflow, state)
	}

	logger := workflow.GetLogger(ctx)
	logger.Info(fmt.Sprintf("starting workflow for service %s, version=%d", state.ServiceID, currentVersion))

	cfg, err := getConfig(ctx)
	if err != nil {
		return err
	}

	if state.Info == nil {
		state.Info = &Info{
			ServiceInfo: &ServiceInfo{},
			PodsInfo:    make(map[string]*pods.PodInfo),
		}
	}

	ctrl := &PodStatesController{
		ctx:       ctx,
		cfg:       cfg,
		serviceID: state.ServiceID,
		states:    state.States,
		info:      state.Info,
		checkers: []Checker{
			&PodsWindowChecker{cfg.PodsWindowCheckerConfig},
			&ServiceWindowChecker{cfg.ServiceWindowCheckerConfig},
			&EvictionRequestedChecker{cfg.EvictionRequestedCheckerConfig},
		},
		ticketController: &TicketController{
			serviceID:           state.ServiceID,
			ctx:                 ctx,
			startrekerExec:      state.StartrekerExecution,
			timeoutConfig:       cfg.TimeoutConfig,
			activitiesTaskQueue: cfg.TaskQueue,
			cfg: &TicketControllerConfig{
				NamespaceID:                       cfg.TicketConfig.NamespaceID,
				TaskQueue:                         cfg.TicketConfig.TaskQueue,
				Queue:                             cfg.TicketConfig.Queue,
				Tags:                              cfg.TicketConfig.Tags,
				RetryInvocationPeriod:             cfg.TicketConfig.RetryInvocationPeriod,
				NannyScheduleID:                   cfg.TicketConfig.NannyScheduleID,
				MaxResponsibleToSummon:            cfg.TicketConfig.MaxResponsibleToSummon,
				MaxFaultyPodsTableSize:            cfg.TicketConfig.MaxFaultyPodsTableSize,
				MaxEvictionRequestedPodsTableSize: cfg.TicketConfig.MaxEvictionRequestedPodsTableSize,
				ManualEvictionPeriod:              cfg.TicketConfig.ManualEvictionPeriod,
				BudgetLeftThreshold:               cfg.TicketConfig.BudgetLeftThreshold,
				MinBudgetLeft:                     cfg.TicketConfig.MinBudgetLeft,

				NannyURL:                 cfg.NannyURL,
				DeployURL:                cfg.DeployURL,
				PollPeriod:               cfg.PollPeriod,
				EvictionExpiredThreshold: cfg.EvictionRequestedCheckerConfig.EvictionExpiredThreshold,
			},
		},
	}

	err = ctrl.runIteration()
	var noSnapErr *NoSnapshotsInServiceError
	if errors.As(err, &noSnapErr) {
		logger.Info("service has no snapshots")
	} else {
		var appErr *temporal.ApplicationError
		if errors.As(err, &appErr) && appErr.Type() == "ServiceNotFoundError" {
			// The service is gone: close its ticket workflow and stop monitoring.
			closeErr := ctrl.ticketController.closeStartrekerWorkflowIfExists(CloseReasonServiceDeleted)
			if closeErr != nil {
				return closeErr
			}
			logger.Info("workflow successfully finished")
			return nil
		}
		if err != nil {
			// The workflow logger takes a message plus key/value pairs; it
			// does not interpret printf verbs such as %w.
			logger.Error("error during iteration", "error", err)
		}
	}

	err = workflow.Sleep(ctx, ctrl.cfg.PollPeriod)
	if err != nil {
		logger.Error("sleep error", "error", err)
	}

	return workflow.NewContinueAsNewError(ctx, ServicePodAlertsWorkflow, &WorkflowState{
		ServiceID:           ctrl.serviceID,
		Config:              ctrl.cfg,
		StartrekerExecution: ctrl.ticketController.startrekerExec,
		States:              ctrl.states,
		Info:                ctrl.info,
	})
}
