# Instantiates the vendored sns-alarm module for this service.
# Its business_hours_topic_arn output is the notification target used by the
# composite alarm in this file.
module "puppet_sns_topic" {
  source       = "./vendor/modules/sns-alarm"
  service_name = var.service
}

# UnHealthyHostCount is almost always 0, except in two cases:
# 1. the cluster is under load
# 2. the ASG is cycling instances
resource "aws_cloudwatch_metric_alarm" "unhealthy_hosts" {
  alarm_name        = "${var.service}_${var.env}_${var.region}_instances"
  alarm_description = "${var.service} ${var.env} ${var.region} has unhealthy instances"

  # Metric under evaluation: unhealthy target count for the puppet target
  # group, as reported in the AWS/NetworkELB namespace.
  namespace   = "AWS/NetworkELB"
  metric_name = "UnHealthyHostCount"
  statistic   = "Average"
  period      = 300
  dimensions = {
    TargetGroup  = aws_lb_target_group.puppet_tg.arn_suffix
    LoadBalancer = aws_lb.puppet.arn_suffix
  }

  # Alarm when the 300-second average is above 0 for at least 2 of the
  # last 6 periods. No actions are attached to this alarm directly; it is
  # referenced by name from local.composite_alarm.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 0
  evaluation_periods  = 6
  datapoints_to_alarm = 2
}

# This anomaly band has only gone off for the cluster during noticeable, cluster-wide impact
resource "aws_cloudwatch_metric_alarm" "cluster_cpu_anomaly" {
  alarm_name        = "${var.service}_${var.env}_${var.region}_CPU_anomaly"
  alarm_description = "${var.service} ${var.env} ${var.region} CPU Utilization anomaly detected."

  # Alarm when m1 is above the upper edge of the band produced by e1 for at
  # least 2 of the last 6 evaluation periods. No actions are attached to this
  # alarm directly; it is referenced by name from local.composite_alarm.
  comparison_operator = "GreaterThanUpperThreshold"
  threshold_metric_id = "e1"
  evaluation_periods  = 6
  datapoints_to_alarm = 2

  # e1: anomaly-detection band computed from m1; used as the alarm threshold
  # via threshold_metric_id.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "CPUUtilization (Expected)"
    return_data = true
  }

  # m1: 300-second average CPU utilization for the service's Auto Scaling group.
  metric_query {
    id          = "m1"
    return_data = true
    metric {
      namespace   = "AWS/EC2"
      metric_name = "CPUUtilization"
      stat        = "Average"
      period      = 300
      dimensions = {
        AutoScalingGroupName = "${var.service}_asg"
      }
    }
  }
}

# Static CPU threshold for the cluster: alarms when the ASG's average CPU utilization is above 30%.
resource "aws_cloudwatch_metric_alarm" "cluster_cpu_threshold" {
  alarm_name        = "${var.service}_${var.env}_${var.region}_CPU_threshold"
  alarm_description = "${var.service} ${var.env} ${var.region} CPU Utilization breached."

  # Metric under evaluation: 300-second average CPU utilization for the
  # service's Auto Scaling group.
  namespace   = "AWS/EC2"
  metric_name = "CPUUtilization"
  statistic   = "Average"
  period      = 300
  dimensions = {
    AutoScalingGroupName = "${var.service}_asg"
  }

  # Alarm when the average is above 30 for at least 2 of the last 6 periods.
  # No actions are attached to this alarm directly; it is referenced by name
  # from local.composite_alarm.
  comparison_operator = "GreaterThanThreshold"
  threshold           = 30
  evaluation_periods  = 6
  datapoints_to_alarm = 2
}

locals {
  # Composite alarm rule: all three metric alarms must be in ALARM state at
  # the same time. The string keeps one ALARM(...) term per line and ends
  # with a newline; the consumer trims surrounding whitespace.
  composite_alarm = format(
    "ALARM(%s) AND\nALARM(%s) AND\nALARM(%s)\n",
    aws_cloudwatch_metric_alarm.unhealthy_hosts.alarm_name,
    aws_cloudwatch_metric_alarm.cluster_cpu_anomaly.alarm_name,
    aws_cloudwatch_metric_alarm.cluster_cpu_threshold.alarm_name,
  )
}

# Gate the three alarms together (unhealthy hosts, CPU anomaly, CPU threshold) to remove noise from the ASG cycling instances.
resource "aws_cloudwatch_composite_alarm" "example" {
  alarm_name        = "${var.service}_${var.env}_${var.region}_alarm"
  alarm_description = "${var.service} ${var.env} ${var.region} service impacted. Runbook: https://wiki.xarth.tv/display/VID/Video+Puppet+Master+Runbook#VideoPuppetMasterRunbook-CloudwatchAlarm"

  # The rule text is assembled in local.composite_alarm; leading/trailing
  # whitespace is stripped before it is handed to CloudWatch.
  alarm_rule = trimspace(local.composite_alarm)

  # The same topic is notified on entering ALARM and on returning to OK.
  # Notifications can be switched off as a whole via var.enable_monitoring.
  actions_enabled = var.enable_monitoring
  alarm_actions   = [module.puppet_sns_topic.business_hours_topic_arn]
  ok_actions      = [module.puppet_sns_topic.business_hours_topic_arn]
}
