# SNS topic for high-urgency alarms. It is only created when
# var.pagerduty_endpoint["low"] is a non-empty string.
resource "aws_sns_topic" "pagerduty_high" {
  # Topic name is "<common name>-pagerduty-high"; the common name falls back
  # to null_resource.vars.triggers.cn when var.common_name is empty.
  name  = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-pagerduty-high"
  count = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
}

# HTTPS subscription attached to the high-urgency topic.
# Created under the same condition as the topic itself.
resource "aws_sns_topic_subscription" "pagerduty_high" {
  count     = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  topic_arn = "${aws_sns_topic.pagerduty_high.arn}"

  protocol               = "https"
  endpoint_auto_confirms = true

  # First non-empty endpoint wins: "urgent", then "daytime", then "low".
  endpoint = "${coalesce(var.pagerduty_endpoint["urgent"], var.pagerduty_endpoint["daytime"], var.pagerduty_endpoint["low"])}"
}

# SNS topic for daytime-urgency alarms. It is only created when
# var.pagerduty_endpoint["low"] is a non-empty string.
resource "aws_sns_topic" "pagerduty_daytime" {
  # Topic name is "<common name>-pagerduty-daytime"; the common name falls
  # back to null_resource.vars.triggers.cn when var.common_name is empty.
  name  = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-pagerduty-daytime"
  count = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
}

# HTTPS subscription attached to the daytime topic.
# Created under the same condition as the topic itself.
resource "aws_sns_topic_subscription" "pagerduty_daytime" {
  count     = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  topic_arn = "${aws_sns_topic.pagerduty_daytime.arn}"

  protocol               = "https"
  endpoint_auto_confirms = true

  # Use the "daytime" endpoint when set, otherwise fall back to "low".
  endpoint = "${coalesce(var.pagerduty_endpoint["daytime"], var.pagerduty_endpoint["low"])}"
}

# SNS topic for low-urgency alarms. It is only created when
# var.pagerduty_endpoint["low"] is a non-empty string.
resource "aws_sns_topic" "pagerduty_low" {
  # Topic name is "<common name>-pagerduty-low"; the common name falls back
  # to null_resource.vars.triggers.cn when var.common_name is empty.
  name  = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-pagerduty-low"
  count = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
}

# HTTPS subscription attached to the low-urgency topic.
# Created under the same condition as the topic itself.
resource "aws_sns_topic_subscription" "pagerduty_low" {
  count     = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  topic_arn = "${aws_sns_topic.pagerduty_low.arn}"

  protocol               = "https"
  endpoint_auto_confirms = true

  # No fallback here: the "low" endpoint is used directly.
  endpoint = "${var.pagerduty_endpoint["low"]}"
}

# Alarm on the custom ErrorCount metric in the "Upload/<common name>"
# namespace. It notifies the high-urgency topic when the alarm fires; no
# ok_actions are configured for it.
resource "aws_cloudwatch_metric_alarm" "error_log" {
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-error-log"
  alarm_description = "Too many lines in the ERROR log streams for both envs."

  # Which metric to watch.
  namespace   = "Upload/${coalesce(var.common_name, null_resource.vars.triggers.cn)}"
  metric_name = "ErrorCount"
  statistic   = "Sum"

  # Breach when the 300-second Sum is >= 1 in a single evaluation period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1"
  period              = "300"
  evaluation_periods  = "1"

  # Cloudwatch doesn't emit 0s for this: when there are no error lines the
  # metric will not be posted at all, so absent data must not count as a breach.
  treat_missing_data = "notBreaching"

  alarm_actions = ["${aws_sns_topic.pagerduty_high.arn}"]
}

# Alarm on root filesystem utilisation of the "<common name>-worker-env"
# Elastic Beanstalk environment. It notifies the high-urgency topic both when
# the alarm fires and when it returns to OK.
resource "aws_cloudwatch_metric_alarm" "worker_disk_usage" {
  count = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-disk-usage"
  alarm_description = "Worker env disk usage % alarm"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods = "5"
  metric_name = "RootFilesystemUtil"
  namespace = "AWS/ElasticBeanstalk"
  # `dimensions` is a map argument, so it is assigned with `=` like every
  # other alarm in this file (block syntax is rejected by Terraform >= 0.12).
  dimensions = {
    EnvironmentName = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-worker-env"
  }
  # Breach when the 60-second Maximum is >= 75 across 5 evaluation periods.
  period = "60"
  statistic = "Maximum"
  threshold = "75"

  alarm_actions = ["${aws_sns_topic.pagerduty_high.arn}"]
  ok_actions = ["${aws_sns_topic.pagerduty_high.arn}"]
}

# Environment-name suffixes. Alarms that use count = length(var.envs) are
# created once per entry, with the suffix appended to the common name.
variable "envs" {
  type        = "list"
  default     = ["-server-env", "-worker-env"]
  description = "tool to duplicate alarms across server and worker envs"
}

# One alarm per entry in var.envs, watching the InstancesSevere metric of the
# matching Elastic Beanstalk environment. It notifies the high-urgency topic
# on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "beanstalk_health_high" {
  # Zero alarms when no "low" endpoint is configured, otherwise one per env.
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? length(var.envs) : 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}${var.envs[count.index]}-health-urgent"
  alarm_description = "Too many instances in severe state"

  # Which metric to watch; the environment name is common name + env suffix.
  namespace   = "AWS/ElasticBeanstalk"
  metric_name = "InstancesSevere"
  statistic   = "Maximum"

  dimensions = {
    EnvironmentName = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}${var.envs[count.index]}"
  }

  # Breach when the 60-second Maximum is >= 2 across 5 evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "2"
  period              = "60"
  evaluation_periods  = "5"

  alarm_actions = ["${aws_sns_topic.pagerduty_high.arn}"]
  ok_actions    = ["${aws_sns_topic.pagerduty_high.arn}"]
}

# Per-environment 5xx alarm thresholds. Keys are the environment suffixes
# from var.envs; the http_5xx alarm looks its threshold up by that suffix.
variable "error_rate_threshold" {
  description = "We retry worker requests, so the error threshold needs to be higher"
  type = "map"
  default = {
    "-server-env" = "1"
    "-worker-env" = "4"
  }
}

# One alarm per entry in var.envs, watching ApplicationRequests5xx for the
# matching Elastic Beanstalk environment. It notifies the low-urgency topic
# on both ALARM and OK transitions.
resource "aws_cloudwatch_metric_alarm" "http_5xx" {
  # Zero alarms when no "low" endpoint is configured, otherwise one per env.
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? length(var.envs): 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}${var.envs[count.index]}-5xx"
  alarm_description = "5xx rate in ${var.envs[count.index]}"

  # Which metric to watch; the environment name is common name + env suffix.
  namespace   = "AWS/ElasticBeanstalk"
  metric_name = "ApplicationRequests5xx"
  statistic   = "Maximum"

  dimensions = {
    EnvironmentName = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}${var.envs[count.index]}"
  }

  # The threshold differs per environment: it is looked up in
  # var.error_rate_threshold using the env suffix as the key.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "${var.error_rate_threshold[var.envs[count.index]]}"
  period              = "60"
  evaluation_periods  = "5"

  alarm_actions = ["${aws_sns_topic.pagerduty_low.arn}"]
  ok_actions    = ["${aws_sns_topic.pagerduty_low.arn}"]
}

# TODO: watch this in email, figure out the best way to monitor the dead-letter queue
resource "aws_cloudwatch_metric_alarm" "failed_request" {
  # Alarm on visible messages in the "<common name>-dead-letter" SQS queue.
  # It notifies the low-urgency topic on both ALARM and OK transitions.
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-failed-request"
  alarm_description = "Too many messages in the dead letter queue"

  # Which metric to watch.
  namespace   = "AWS/SQS"
  metric_name = "ApproximateNumberOfMessagesVisible"
  statistic   = "Maximum"

  dimensions = {
    QueueName = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-dead-letter"
  }

  # Breach when the 60-second Maximum is >= 1 across 5 evaluation periods.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1"
  period              = "60"
  evaluation_periods  = "5"

  alarm_actions = ["${aws_sns_topic.pagerduty_low.arn}"]
  ok_actions    = ["${aws_sns_topic.pagerduty_low.arn}"]
}

# Capacity-planning alarm on DynamoDB consumed read capacity. It notifies the
# daytime topic on both ALARM and OK transitions.
#
# NOTE(review): no `dimensions` are set here. AWS/DynamoDB metrics are
# normally published per TableName, so confirm this alarm actually matches a
# metric rather than sitting in INSUFFICIENT_DATA.
resource "aws_cloudwatch_metric_alarm" "dynamo_read_capacity" {
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-dynamo-read-capacity"
  alarm_description = "Dynamo read usage >= 75% of provisioned, reevaluate capacity"

  metric_name = "ConsumedReadCapacityUnits"
  namespace   = "AWS/DynamoDB"
  statistic   = "Sum"

  # The metric is summed over a 300-second period, so the configured read
  # capacity is scaled by 300 and then by 0.75 for the 75% mark.
  # Presumably var.dynamo_capacity["read"] is units per second; verify.
  period              = "300"
  evaluation_periods  = "12"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "${var.dynamo_capacity["read"] * 300 * 0.75}"

  ok_actions    = ["${aws_sns_topic.pagerduty_daytime.arn}"]
  alarm_actions = ["${aws_sns_topic.pagerduty_daytime.arn}"]
}

# Capacity-planning alarm on DynamoDB consumed write capacity. It notifies the
# daytime topic on both ALARM and OK transitions.
#
# NOTE(review): no `dimensions` are set here. AWS/DynamoDB metrics are
# normally published per TableName, so confirm this alarm actually matches a
# metric rather than sitting in INSUFFICIENT_DATA.
resource "aws_cloudwatch_metric_alarm" "dynamo_write_capacity" {
  count = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-dynamo-write-capacity"
  alarm_description = "Dynamo write usage >= 75% of provisioned, reevaluate capacity"

  namespace = "AWS/DynamoDB"
  # Must be the write metric: the threshold below is derived from the
  # configured write capacity, so comparing it against consumed reads would
  # make this alarm meaningless.
  metric_name = "ConsumedWriteCapacityUnits"
  statistic = "Sum"

  # The metric is summed over a 300-second period, so the configured write
  # capacity is scaled by 300 and then by 0.75 for the 75% mark.
  threshold = "${var.dynamo_capacity["write"] * 300 * 0.75}"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods = "12"
  period = "300"

  alarm_actions = ["${aws_sns_topic.pagerduty_daytime.arn}"]
  ok_actions = ["${aws_sns_topic.pagerduty_daytime.arn}"]
}

# Alarm on DynamoDB read throttling. It notifies the high-urgency topic on
# both ALARM and OK transitions.
#
# NOTE(review): no `dimensions` are set here. AWS/DynamoDB metrics are
# normally published per TableName, so confirm this alarm actually matches a
# metric rather than sitting in INSUFFICIENT_DATA.
resource "aws_cloudwatch_metric_alarm" "dynamo_read_throttle" {
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-dynamo-read-throttle"
  alarm_description = "Dynamo reads are being throttled, increase capacity immediately"

  metric_name = "ReadThrottleEvents"
  namespace   = "AWS/DynamoDB"
  statistic   = "Sum"

  # Breach when the 60-second Sum is >= 10 across 5 evaluation periods.
  period              = "60"
  evaluation_periods  = "5"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "10"

  ok_actions    = ["${aws_sns_topic.pagerduty_high.arn}"]
  alarm_actions = ["${aws_sns_topic.pagerduty_high.arn}"]
}

# Alarm on DynamoDB write throttling. It notifies the high-urgency topic on
# both ALARM and OK transitions.
#
# NOTE(review): no `dimensions` are set here. AWS/DynamoDB metrics are
# normally published per TableName, so confirm this alarm actually matches a
# metric rather than sitting in INSUFFICIENT_DATA.
resource "aws_cloudwatch_metric_alarm" "dynamo_write_throttle" {
  count             = "${length(var.pagerduty_endpoint["low"]) > 0? 1 : 0}"
  alarm_name        = "${coalesce(var.common_name, null_resource.vars.triggers.cn)}-dynamo-write-throttle"
  alarm_description = "Dynamo writes are being throttled, increase capacity immediately"

  metric_name = "WriteThrottleEvents"
  namespace   = "AWS/DynamoDB"
  statistic   = "Sum"

  # Breach when the 60-second Sum is >= 10 across 5 evaluation periods.
  period              = "60"
  evaluation_periods  = "5"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "10"

  ok_actions    = ["${aws_sns_topic.pagerduty_high.arn}"]
  alarm_actions = ["${aws_sns_topic.pagerduty_high.arn}"]
}
