# SNS topic that every alarm in this file publishes its ALARM and OK
# transitions to. The name is "<name>-<environment>-alerts".
resource "aws_sns_topic" "alerts" {
  name = "${format("%s-%s-alerts", var.name, var.environment)}"
}

# Forwards everything published on the alerts topic to the HTTPS endpoint
# supplied in var.pagerduty_endpoint.
resource "aws_sns_topic_subscription" "pagerduty" {
  topic_arn = "${aws_sns_topic.alerts.arn}"

  # Delivery target.
  protocol = "https"
  endpoint = "${var.pagerduty_endpoint}"

  # The endpoint is declared as confirming the subscription on its own.
  endpoint_auto_confirms = true
}

# Alarms when the GetRecords iterator age of the Kinesis stream reaches
# 30 minutes.
resource "aws_cloudwatch_metric_alarm" "kinesis-iterator-age" {
  alarm_name        = "${var.name}-${var.environment}-kinesis-iterator-age"
  alarm_description = "Records older than 30 minutes are queued in the Kinesis stream"

  # Metric under watch.
  namespace   = "AWS/Kinesis"
  metric_name = "GetRecords.IteratorAgeMilliseconds"

  dimensions = {
    StreamName = "${var.kinesis_stream_name}"
  }

  # Evaluation: the Minimum over a single 300-second period must be at or
  # above 1800000 milliseconds (30 minutes).
  statistic           = "Minimum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1800000"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when reads from the Kinesis stream are being throttled.
resource "aws_cloudwatch_metric_alarm" "kinesis-read-exceeded" {
  alarm_name        = "${var.name}-${var.environment}-kinesis-read-exceeded"
  alarm_description = "Reads are being throttled in the Kinesis stream"

  # Metric under watch.
  namespace   = "AWS/Kinesis"
  metric_name = "ReadProvisionedThroughputExceeded"

  dimensions = {
    StreamName = "${var.kinesis_stream_name}"
  }

  # Evaluation: the Average over a single 300-second period must be at or
  # above 0.5, i.e. half of all reads were throttled.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "0.5"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when writes to the Kinesis stream are being throttled.
resource "aws_cloudwatch_metric_alarm" "kinesis-write-exceeded" {
  alarm_name        = "${var.name}-${var.environment}-kinesis-write-exceeded"
  alarm_description = "Writes are being throttled in the Kinesis stream"

  # Metric under watch.
  namespace   = "AWS/Kinesis"
  metric_name = "WriteProvisionedThroughputExceeded"

  dimensions = {
    StreamName = "${var.kinesis_stream_name}"
  }

  # Evaluation: the Average over a single 300-second period must be at or
  # above 0.5, i.e. half of all writes were throttled.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "0.5"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the event_store Lambda's iterator age reaches 30 minutes.
resource "aws_cloudwatch_metric_alarm" "event-store-iterator-age" {
  alarm_name        = "${var.name}-${var.environment}-event-store-iterator-age"
  alarm_description = "Records older than 30 minutes are queued for event_store Lambda"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "IteratorAge"

  dimensions = {
    FunctionName = "${var.event_store_lambda_name}"
  }

  # Evaluation: the Minimum over a single 300-second period must be at or
  # above 1800000, i.e. 30 minutes.
  statistic           = "Minimum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1800000"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the event_store Lambda keeps reporting invocation errors.
resource "aws_cloudwatch_metric_alarm" "event-store-errors" {
  alarm_name        = "${var.name}-${var.environment}-event-store-errors"
  alarm_description = "event_store Lambda is reporting invocation errors"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "Errors"

  dimensions = {
    FunctionName = "${var.event_store_lambda_name}"
  }

  # Evaluation: the Sum must be strictly greater than 1 in each of 2
  # consecutive 300-second (5-minute) periods.
  # NOTE(review): with a strict ">" a single error per period does not
  # trip the alarm - confirm that is intended.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the blueprint_ingest Lambda's iterator age reaches 30 minutes.
resource "aws_cloudwatch_metric_alarm" "blueprint-ingest-iterator-age" {
  alarm_name        = "${var.name}-${var.environment}-blueprint-ingest-iterator-age"
  alarm_description = "Records older than 30 minutes are queued for blueprint_ingest Lambda"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "IteratorAge"

  dimensions = {
    FunctionName = "${var.blueprint_ingest_lambda_name}"
  }

  # Evaluation: the Minimum over a single 300-second period must be at or
  # above 1800000, i.e. 30 minutes.
  statistic           = "Minimum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1800000"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the blueprint_ingest Lambda keeps reporting invocation errors.
resource "aws_cloudwatch_metric_alarm" "blueprint-ingest-errors" {
  alarm_name        = "${var.name}-${var.environment}-blueprint-ingest-errors"
  alarm_description = "blueprint_ingest Lambda is reporting invocation errors"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "Errors"

  dimensions = {
    FunctionName = "${var.blueprint_ingest_lambda_name}"
  }

  # Evaluation: the Sum must be strictly greater than 1 in each of 2
  # consecutive 300-second (5-minute) periods.
  # NOTE(review): with a strict ">" a single error per period does not
  # trip the alarm - confirm that is intended.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the github_stats Lambda's iterator age reaches 30 minutes.
resource "aws_cloudwatch_metric_alarm" "github-stats-iterator-age" {
  alarm_name        = "${var.name}-${var.environment}-github-stats-iterator-age"
  alarm_description = "Records older than 30 minutes are queued for github_stats Lambda"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "IteratorAge"

  dimensions = {
    FunctionName = "${var.github_stats_lambda_name}"
  }

  # Evaluation: the Minimum over a single 300-second period must be at or
  # above 1800000, i.e. 30 minutes.
  statistic           = "Minimum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1800000"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the github_stats Lambda keeps reporting invocation errors.
resource "aws_cloudwatch_metric_alarm" "github-stats-errors" {
  alarm_name        = "${var.name}-${var.environment}-github-stats-errors"
  alarm_description = "github_stats Lambda is reporting invocation errors"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "Errors"

  dimensions = {
    FunctionName = "${var.github_stats_lambda_name}"
  }

  # Evaluation: the Sum must be strictly greater than 1 in each of 2
  # consecutive 300-second (5-minute) periods.
  # NOTE(review): with a strict ">" a single error per period does not
  # trip the alarm - confirm that is intended.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the JIRA Scraper Lambda reports invocation errors in two
# consecutive hours.
resource "aws_cloudwatch_metric_alarm" "jira-scraper-errors" {
  alarm_name        = "${var.name}-${var.environment}-jira-scraper-errors"
  alarm_description = "JIRA Scraper Lambda is reporting invocation errors"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "Errors"

  dimensions = {
    FunctionName = "${var.jira_scraper_lambda_name}"
  }

  # Evaluation: the Sum must be at least 1 in each of 2 consecutive
  # 3600-second (1-hour) periods.
  statistic           = "Sum"
  period              = "3600"
  evaluation_periods  = "2"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the GitHub Webhook Lambda reports invocation errors within
# an hour.
resource "aws_cloudwatch_metric_alarm" "github-webhook-errors" {
  alarm_name        = "${var.name}-${var.environment}-github-webhook-errors"
  alarm_description = "GitHub Webhook Lambda is reporting invocation errors"

  # Metric under watch.
  namespace   = "AWS/Lambda"
  metric_name = "Errors"

  dimensions = {
    FunctionName = "${var.github_webhook_lambda_name}"
  }

  # Evaluation: the Sum over a single 3600-second (1-hour) period must be
  # strictly greater than 1.
  # NOTE(review): with a strict ">" a single error in the hour does not
  # trip the alarm - confirm that is intended.
  statistic           = "Sum"
  period              = "3600"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the oldest message in the worker queue is at least
# 30 minutes old.
resource "aws_cloudwatch_metric_alarm" "worker-queue-message-age" {
  alarm_name        = "${var.name}-${var.environment}-worker-queue-message-age"
  alarm_description = "Messages older than 30 minutes are queued in the worker queue"

  # Metric under watch.
  namespace   = "AWS/SQS"
  metric_name = "ApproximateAgeOfOldestMessage"

  dimensions = {
    QueueName = "${var.worker_queue_name}"
  }

  # Evaluation: the Minimum over a single 300-second period must be at or
  # above 1800, i.e. 30 minutes.
  statistic           = "Minimum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "1800"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the Beanstalk environment reports Severe health.
resource "aws_cloudwatch_metric_alarm" "environment-health" {
  alarm_name        = "${var.name}-${var.environment}-environment-health"
  alarm_description = "Beanstalk environment has Severe health"

  # Metric under watch.
  namespace   = "AWS/ElasticBeanstalk"
  metric_name = "EnvironmentHealth"

  dimensions = {
    EnvironmentName = "${var.eb_environment_name}"
  }

  # Evaluation: the Minimum over a single 300-second period must be at or
  # above 25, the value that stands for Severe.
  statistic           = "Minimum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "25"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the GitHub webhook API stage returns 5xx responses.
resource "aws_cloudwatch_metric_alarm" "github-webhook-5xx-errors" {
  alarm_name        = "${var.name}-${var.environment}-github-webhook-5xx-errors"
  alarm_description = "GitHub webhook receiver reporting 5xx errors"

  # Metric under watch, scoped to one API and one stage.
  namespace   = "AWS/ApiGateway"
  metric_name = "5XXError"

  dimensions = {
    ApiName = "${var.github_webhook_api_name}"
    Stage   = "${var.github_webhook_stage_name}"
  }

  # Evaluation: the Sum over a single 3600-second period must be strictly
  # greater than 1.
  # NOTE(review): with a strict ">" a single 5xx in the hour does not
  # trip the alarm - confirm that is intended.
  statistic           = "Sum"
  period              = "3600"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the GitHub webhook API stage returns 4xx responses.
resource "aws_cloudwatch_metric_alarm" "github-webhook-4xx-errors" {
  alarm_name        = "${var.name}-${var.environment}-github-webhook-4xx-errors"
  alarm_description = "GitHub webhook receiver reporting 4xx errors"

  # Metric under watch, scoped to one API and one stage.
  namespace   = "AWS/ApiGateway"
  metric_name = "4XXError"

  dimensions = {
    ApiName = "${var.github_webhook_api_name}"
    Stage   = "${var.github_webhook_stage_name}"
  }

  # Evaluation: the Sum over a single 3600-second period must be strictly
  # greater than 1.
  # NOTE(review): with a strict ">" a single 4xx in the hour does not
  # trip the alarm - confirm that is intended.
  statistic           = "Sum"
  period              = "3600"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "1"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the projects table stays at or above 80% of its read
# capacity for 5 minutes.
resource "aws_cloudwatch_metric_alarm" "projects-ReadCapacityUnitsLimit-BasicAlarm" {
  alarm_name        = "${var.name}-${var.environment}-projects-ReadCapacityUnitsLimit-BasicAlarm"
  alarm_description = "Projects table is >= 80% read capacity for 5 minutes"

  # Metric under watch.
  namespace   = "AWS/DynamoDB"
  metric_name = "ConsumedReadCapacityUnits"

  dimensions = {
    TableName = "${var.projects_table_name}"
  }

  # Evaluation: the Sum must reach the threshold in each of 5 consecutive
  # 60-second periods.
  # The factor 48 is 0.8 * 60: 80% of the capacity summed over one 60-second
  # period. Presumably var.projects_table_read_capacity is expressed in
  # units per second - confirm against the table definition.
  statistic           = "Sum"
  period              = "60"
  evaluation_periods  = "5"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "${var.projects_table_read_capacity * 48}"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}

# Alarms when the projects table stays at or above 80% of its write
# capacity for 5 minutes.
resource "aws_cloudwatch_metric_alarm" "projects-WriteCapacityUnitsLimit-BasicAlarm" {
  alarm_name        = "${var.name}-${var.environment}-projects-WriteCapacityUnitsLimit-BasicAlarm"
  alarm_description = "Projects table is >= 80% write capacity for 5 minutes"

  # Metric under watch.
  namespace   = "AWS/DynamoDB"
  metric_name = "ConsumedWriteCapacityUnits"

  dimensions = {
    TableName = "${var.projects_table_name}"
  }

  # Evaluation: the Sum must reach the threshold in each of 5 consecutive
  # 60-second periods.
  # The factor 48 is 0.8 * 60: 80% of the capacity summed over one 60-second
  # period. Presumably var.projects_table_write_capacity is expressed in
  # units per second - confirm against the table definition.
  statistic           = "Sum"
  period              = "60"
  evaluation_periods  = "5"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "${var.projects_table_write_capacity * 48}"

  # Both the ALARM and the OK transition notify the shared alerts topic.
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
}
