# Looks up the dashboard's load balancer from the ARN supplied in
# var.dashboard_alb_arn. The alarms below read its arn_suffix attribute to
# fill in the LoadBalancer dimension of the AWS/ApplicationELB metrics.
data "aws_alb" "dashboard_alb" {
  arn = "${var.dashboard_alb_arn}"
}

# Since the overall throughput is low, we monitor larger windows of request
# ingest and alarm if any single window has a high error rate.
# Alarming only on consecutive windows above the threshold could cause false
# negatives: if no requests come in during the follow-up windows, the alarm
# would incorrectly be flagged as OK.
resource "aws_cloudwatch_metric_alarm" "dashboard_5xx_rate" {
  alarm_name        = "dashboard-5xx-rate"
  alarm_description = "dashboard has exceeded 5 5xx's over 5 minute window"

  # Fire as soon as a single 5 minute window contains 5 or more target 5xx
  # responses; no consecutive-window requirement.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 5
  evaluation_periods  = 1

  # The ALB only publishes HTTPCode_Target_5XX_Count when the value is
  # nonzero, so a healthy window produces no datapoint at all. With the
  # default ("missing") the alarm would drop from ALARM to INSUFFICIENT_DATA
  # once errors stop, and because insufficient_data_actions is empty the
  # ok_actions below would never run. Treating a missing datapoint as
  # "notBreaching" lets a quiet window return the alarm to OK.
  treat_missing_data = "notBreaching"

  # Both the ALARM and OK transitions notify the high-urgency PagerDuty target
  # exported by module.pagerduty; INSUFFICIENT_DATA notifies nobody.
  alarm_actions             = ["${module.pagerduty.pd_high_urgency_arn}"]
  ok_actions                = ["${module.pagerduty.pd_high_urgency_arn}"]
  insufficient_data_actions = []

  metric_query {
    id          = "errors"
    return_data = true

    metric {
      namespace   = "AWS/ApplicationELB"
      metric_name = "HTTPCode_Target_5XX_Count"

      # Sum of 5xx responses per 300 second (5 minute) period.
      stat   = "Sum"
      period = 300
      unit   = "Count"

      dimensions = {
        LoadBalancer = "${data.aws_alb.dashboard_alb.arn_suffix}"
      }
    }
  }
}

# Sustained-latency alarm: p99 TargetResponseTime for the dashboard ALB must be
# at or above 5 seconds in 3 out of 3 consecutive 5 minute periods.
resource "aws_cloudwatch_metric_alarm" "dashboard_aggregate_latency" {
  alarm_name        = "dashboard-aggregate-latency"
  alarm_description = "dashboard p99 latency has exceeded SLA"

  # Threshold is in the metric's unit (Seconds, see the metric block).
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 5

  # Every one of the last three periods has to breach before alarming.
  evaluation_periods  = 3
  datapoints_to_alarm = 3

  # ALARM and OK transitions both notify the low-urgency PagerDuty target
  # exported by module.pagerduty; INSUFFICIENT_DATA notifies nobody.
  alarm_actions             = ["${module.pagerduty.pd_low_urgency_arn}"]
  ok_actions                = ["${module.pagerduty.pd_low_urgency_arn}"]
  insufficient_data_actions = []

  metric_query {
    id          = "latency"
    return_data = true

    metric {
      namespace   = "AWS/ApplicationELB"
      metric_name = "TargetResponseTime"

      # 99th percentile over each 300 second (5 minute) period.
      stat   = "p99"
      period = 300
      unit   = "Seconds"

      dimensions = {
        LoadBalancer = "${data.aws_alb.dashboard_alb.arn_suffix}"
      }
    }
  }
}

# We use a high value for single request latency because requests to the service catalog may be extremely slow due to Lambda cold starts.
# Spike alarm: fires when p99 TargetResponseTime for the dashboard ALB is at or
# above 30 seconds in any single 5 minute period.
resource "aws_cloudwatch_metric_alarm" "dashboard_single_request_latency" {
  alarm_name = "dashboard-single-request-latency"

  # NOTE(review): this description is identical to the one on
  # dashboard_aggregate_latency; consider making it specific to this alarm.
  alarm_description = "dashboard p99 latency has exceeded SLA"

  # Threshold is in the metric's unit (Seconds, see the metric block); one
  # breaching period is enough to alarm.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 30
  evaluation_periods  = 1

  # ALARM and OK transitions both notify the low-urgency PagerDuty target
  # exported by module.pagerduty; INSUFFICIENT_DATA notifies nobody.
  alarm_actions             = ["${module.pagerduty.pd_low_urgency_arn}"]
  ok_actions                = ["${module.pagerduty.pd_low_urgency_arn}"]
  insufficient_data_actions = []

  metric_query {
    id          = "latency"
    return_data = true

    metric {
      namespace   = "AWS/ApplicationELB"
      metric_name = "TargetResponseTime"

      # 99th percentile over each 300 second (5 minute) period.
      stat   = "p99"
      period = 300
      unit   = "Seconds"

      dimensions = {
        LoadBalancer = "${data.aws_alb.dashboard_alb.arn_suffix}"
      }
    }
  }
}
