# Contains configuration for alerting based on ECS Events

# SNS topic that receives every Grid Reboot alert raised in this file: the
# CloudWatch event targets and the metric alarm all reference its ARN.
resource "aws_sns_topic" "GridRebootError" {
  display_name = "Grid Reboot"
  name         = "Grid-Reboot-Container-Error"
}

# Access policy attached to the GridRebootError topic. It has two statements:
#   1. "__default_statement_ID" - allows any AWS principal to perform the
#      listed SNS management/publish actions on the topic, but only when
#      AWS:SourceOwner equals the account id of the current caller
#      (data.aws_caller_identity.current).
#   2. "AWSEvents_GridRebootFailed_SendAsGridRebootError" - allows the
#      CloudWatch Events service (events.amazonaws.com) to publish to the
#      topic, which the event targets in this file rely on.
resource "aws_sns_topic_policy" "GridRebootEventPolicy" {
  arn = "${aws_sns_topic.GridRebootError.arn}"
  policy = <<EOF
{
  "Version": "2012-10-17",
  "Id": "__default_policy_ID",
  "Statement": [
    {
      "Sid": "__default_statement_ID",
      "Effect": "Allow",
      "Principal": {
        "AWS": "*"
      },
      "Action": [
        "SNS:GetTopicAttributes",
        "SNS:SetTopicAttributes",
        "SNS:AddPermission",
        "SNS:RemovePermission",
        "SNS:DeleteTopic",
        "SNS:Subscribe",
        "SNS:ListSubscriptionsByTopic",
        "SNS:Publish",
        "SNS:Receive"
      ],
      "Resource": "${aws_sns_topic.GridRebootError.arn}",
      "Condition": {
        "StringEquals": {
          "AWS:SourceOwner": "${data.aws_caller_identity.current.account_id}"
        }
      }
    },
    {
      "Sid": "AWSEvents_GridRebootFailed_SendAsGridRebootError",
      "Effect": "Allow",
      "Principal": {
        "Service": "events.amazonaws.com"
      },
      "Action": "sns:Publish",
      "Resource": "${aws_sns_topic.GridRebootError.arn}"
    }
  ]
}
EOF
}

# Deliver everything published to the GridRebootError topic to PagerDuty over
# HTTPS. The endpoint URL is supplied via var.pagerduty_alert_subscription and
# is declared as auto-confirming its own subscription.
resource "aws_sns_topic_subscription" "GridRebootNotify" {
  endpoint               = "${var.pagerduty_alert_subscription}"
  endpoint_auto_confirms = true
  protocol               = "https"
  topic_arn              = "${aws_sns_topic.GridRebootError.arn}"
}

# CloudWatch Events rule for a failed Grid Reboot run. It matches
# "ECS Task State Change" events from aws.ecs on the GridReboot cluster where
# the task is STOPPED, a container reports exit code 1, and the stopped reason
# is "Essential container in task exited".
resource "aws_cloudwatch_event_rule" "GridRebootFailedExit1" {
  description = "Triggered when the Grid Reboot container exited with code 1"
  name        = "GridRebootContainerExit1"

  event_pattern = <<EOF
{
  "source": [
    "aws.ecs"
  ],
  "detail-type": [
    "ECS Task State Change"
  ],
  "detail": {
    "clusterArn": [
        "${aws_ecs_cluster.GridReboot.arn}"
    ],
    "lastStatus": [
      "STOPPED"
    ],
    "containers": {
      "exitCode": [
        1
      ]
    },
    "stoppedReason": [
      "Essential container in task exited"
    ]
  }
}
EOF
}

# CloudWatch Events rule for a Grid Reboot task that never started. It matches
# "ECS Task State Change" events from aws.ecs on the GridReboot cluster where
# the task is STOPPED with the stopped reason "Task failed to start".
resource "aws_cloudwatch_event_rule" "GridRebootDidNotStart" {
  description = "Triggered when the Grid Reboot container failed to start"
  name        = "GridRebootFailedToStart"

  event_pattern = <<EOF
{
  "source": [
    "aws.ecs"
  ],
  "detail-type": [
    "ECS Task State Change"
  ],
  "detail": {
    "clusterArn": [
        "${aws_ecs_cluster.GridReboot.arn}"
    ],
    "lastStatus": [
      "STOPPED"
    ],
    "stoppedReason": [
      "Task failed to start"
    ]
  }
}
EOF
}

# Route matches of the "exited with code 1" rule (GridRebootFailedExit1) to
# the GridRebootError SNS topic.
resource "aws_cloudwatch_event_target" "GridReboot" {
  arn  = "${aws_sns_topic.GridRebootError.arn}"
  rule = "${aws_cloudwatch_event_rule.GridRebootFailedExit1.name}"
}

# Route matches of the "failed to start" rule (GridRebootDidNotStart) to the
# GridRebootError SNS topic.
resource "aws_cloudwatch_event_target" "GridRebootFailedToStart" {
  arn  = "${aws_sns_topic.GridRebootError.arn}"
  rule = "${aws_cloudwatch_event_rule.GridRebootDidNotStart.name}"
}

# Alarm on the custom "ExecutionResult" metric in the "Grid Reboot" namespace.
# The metric is expected to be 1 for a pass and 0 for a failure, so the alarm
# fires when the minimum over a single 1-minute period drops below 1. Both the
# ALARM and OK transitions are published to the GridRebootError topic.
resource "aws_cloudwatch_metric_alarm" "ExecutionFailure" {
  alarm_name        = "GridRebootExecutionFailure"
  alarm_description = "This metric alerts when Grid Reboot's Execution fails (result 0)"

  # Which metric to watch
  namespace   = "Grid Reboot"
  metric_name = "ExecutionResult"

  # Minimum, so that any single 0 data point within the period trips the alarm
  statistic = "Minimum"

  # One evaluation period of 60 seconds (1 minute)
  period             = "60"
  evaluation_periods = "1"

  # 1 = Pass, so anything strictly below 1 is a failure
  comparison_operator = "LessThanThreshold"
  threshold           = "1"

  treat_missing_data = "missing"

  # Notify the same topic when entering ALARM and when returning to OK
  alarm_actions = ["${aws_sns_topic.GridRebootError.arn}"]
  ok_actions    = ["${aws_sns_topic.GridRebootError.arn}"]
}

// NOTE: Not possible - CloudWatch does not currently support alarms that evaluate a metric over more than 1 day
//resource "aws_cloudwatch_metric_alarm" "NoDataFailure" {
//  alarm_name                = "GridRebootNoRuns"
//  comparison_operator       = "LessThanThreshold"
//  metric_name               = "Invocations"
//  namespace                 = "Grid Reboot"
//  period                    = "86400" // 1 day
//  evaluation_periods        = "2" // Evaluate over 2 days
//  statistic                 = "Sum"
//  threshold                 = "1"
//  alarm_description         = "This metric alerts when no runs have occurred"
//  treat_missing_data        = "breaching"
//  insufficient_data_actions = ["${aws_sns_topic.GridRebootError.arn}"]
//}
