# Shared settings for the log-derived metrics and CloudWatch alarms in this file.
locals {
  # Custom CloudWatch namespace that every log metric filter below publishes into
  # and that the log-based alarms read from.
  namespace = "eventbus/logs"

  # Alarm thresholds. The warning/error/unhealthy-host values are compared against
  # a statistic computed over each 60-second alarm period.
  max_warnings        = 50
  max_errors          = 1
  max_unhealthy_hosts = 1
  max_p99_latency     = 0.5 # seconds

  # High-urgency error alarms use high_priority_period as their evaluation_periods
  # count. Because those alarms use a 60-second period, the window is 180 minutes,
  # and the alarm fires once high_priority_datapoints_to_alarm periods breach.
  high_priority_period              = 180 # minutes
  high_priority_datapoints_to_alarm = 3

  # Metric names per component, keyed by severity. Metric filters publish under
  # these names and the alarms look the same names up, keeping the two in sync.
  metric_names = {
    controlplane_httpserver = {
      warning = "ControlplaneHttpserverWarningsCount"
      error   = "ControlplaneHttpserverErrorsCount"
    }
    controlplane_converger = {
      warning = "ControlplaneConvergerWarningsCount"
      error   = "ControlplaneConvergerErrorsCount"
    }
    dashboard = {
      warning = "DashboardWarningsCount"
      error   = "DashboardErrorsCount"
    }
  }
}

# PagerDuty integration module. The alarms in this file send their alarm/ok
# notifications to its pd_low_urgency_arn and pd_high_urgency_arn outputs.
module "pagerduty" {
  source = "../pagerduty"

  environment = var.environment
}

# Counts controlplane httpserver log events whose `level` field is "warning".
resource "aws_cloudwatch_log_metric_filter" "controlplane_httpserver_warnings" {
  name           = "eventbus-controlplane-httpserver-warnings-${var.environment}"
  log_group_name = aws_cloudwatch_log_group.controlplane_httpserver_logs.name
  pattern        = "{$.level = \"warning\"}"

  # Every matching log event adds 1 to the warnings metric.
  metric_transformation {
    namespace = local.namespace
    name      = local.metric_names.controlplane_httpserver.warning
    value     = "1"
  }
}

# Counts controlplane httpserver log events whose `level` field is "error",
# "fatal" or "panic".
resource "aws_cloudwatch_log_metric_filter" "controlplane_httpserver_errors" {
  name           = "eventbus-controlplane-httpserver-errors-${var.environment}"
  log_group_name = aws_cloudwatch_log_group.controlplane_httpserver_logs.name
  pattern        = "{$.level = \"error\" || $.level = \"fatal\" || $.level = \"panic\"}"

  # Every matching log event adds 1 to the errors metric.
  metric_transformation {
    namespace = local.namespace
    name      = local.metric_names.controlplane_httpserver.error
    value     = "1"
  }
}

# Counts controlplane converger log events whose `level` field is "warning".
resource "aws_cloudwatch_log_metric_filter" "controlplane_converger_warnings" {
  name           = "eventbus-controlplane-converger-warnings-${var.environment}"
  log_group_name = aws_cloudwatch_log_group.controlplane_converger_logs.name
  pattern        = "{$.level = \"warning\"}"

  # Every matching log event adds 1 to the warnings metric.
  metric_transformation {
    namespace = local.namespace
    name      = local.metric_names.controlplane_converger.warning
    value     = "1"
  }
}

# Counts controlplane converger log events whose `level` field is "error",
# "fatal" or "panic".
resource "aws_cloudwatch_log_metric_filter" "controlplane_converger_errors" {
  name           = "eventbus-controlplane-converger-errors-${var.environment}"
  log_group_name = aws_cloudwatch_log_group.controlplane_converger_logs.name
  pattern        = "{$.level = \"error\" || $.level = \"fatal\" || $.level = \"panic\"}"

  # Every matching log event adds 1 to the errors metric.
  metric_transformation {
    namespace = local.namespace
    name      = local.metric_names.controlplane_converger.error
    value     = "1"
  }
}

# Counts dashboard log events whose `level` field is "warning".
resource "aws_cloudwatch_log_metric_filter" "dashboard_warnings" {
  name           = "eventbus-dashboard-warnings-${var.environment}"
  log_group_name = aws_cloudwatch_log_group.dashboard-logs.name
  pattern        = "{$.level = \"warning\"}"

  # Every matching log event adds 1 to the warnings metric.
  metric_transformation {
    namespace = local.namespace
    name      = local.metric_names.dashboard.warning
    value     = "1"
  }
}

# Counts dashboard log events whose `level` field is "error", "fatal" or "panic".
resource "aws_cloudwatch_log_metric_filter" "dashboard_errors" {
  name           = "eventbus-dashboard-errors-${var.environment}"
  log_group_name = aws_cloudwatch_log_group.dashboard-logs.name
  pattern        = "{$.level = \"error\" || $.level = \"fatal\" || $.level = \"panic\"}"

  # Every matching log event adds 1 to the errors metric.
  metric_transformation {
    namespace = local.namespace
    name      = local.metric_names.dashboard.error
    value     = "1"
  }
}

# Low-urgency alarm: controlplane httpserver warnings reach local.max_warnings
# (per-minute Sum) in 3 consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "controlplane_httpserver_warnings_alarm" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = aws_cloudwatch_log_metric_filter.controlplane_httpserver_warnings.name
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = local.metric_names["controlplane_httpserver"]["warning"]
  namespace           = local.namespace
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  statistic           = "Sum"
  threshold           = local.max_warnings

  # The metric filter publishes no datapoints while no matching events are
  # logged. Treat those gaps as healthy so the alarm returns to OK (and fires
  # ok_actions) instead of parking in INSUFFICIENT_DATA. Same setting as the
  # high-urgency error alarms.
  treat_missing_data = "notBreaching"
}

# Low-urgency alarm: controlplane httpserver errors reach local.max_errors
# (per-minute Sum) in 3 consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "controlplane_httpserver_errors_alarm_low_urgency" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = "${aws_cloudwatch_log_metric_filter.controlplane_httpserver_errors.name}-low-urgency"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = local.metric_names["controlplane_httpserver"]["error"]
  namespace           = local.namespace
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  statistic           = "Sum"
  threshold           = local.max_errors

  # The metric filter publishes no datapoints while no matching events are
  # logged. Treat those gaps as healthy so the alarm returns to OK (and fires
  # ok_actions) instead of parking in INSUFFICIENT_DATA. Same setting as the
  # high-urgency error alarms.
  treat_missing_data = "notBreaching"
}

# High-urgency alarm: controlplane httpserver errors reach local.max_errors
# (per-minute Sum) in at least local.high_priority_datapoints_to_alarm of the
# last local.high_priority_period 60-second periods.
resource "aws_cloudwatch_metric_alarm" "controlplane_httpserver_errors_alarm_high_urgency" {
  alarm_name = "${aws_cloudwatch_log_metric_filter.controlplane_httpserver_errors.name}-high-urgency"

  # Metric under evaluation.
  namespace   = local.namespace
  metric_name = local.metric_names.controlplane_httpserver.error
  statistic   = "Sum"
  period      = 60

  # Breach condition: M-of-N evaluation; periods without data count as healthy.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.max_errors
  evaluation_periods  = local.high_priority_period
  datapoints_to_alarm = local.high_priority_datapoints_to_alarm
  treat_missing_data  = "notBreaching"

  # Notify PagerDuty (high urgency) on both trigger and recovery.
  alarm_actions = [module.pagerduty.pd_high_urgency_arn]
  ok_actions    = [module.pagerduty.pd_high_urgency_arn]
}

# Low-urgency alarm: controlplane converger warnings reach local.max_warnings
# (per-minute Sum) in 3 consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "controlplane_converger_warnings_alarm" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = aws_cloudwatch_log_metric_filter.controlplane_converger_warnings.name
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = local.metric_names["controlplane_converger"]["warning"]
  namespace           = local.namespace
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  statistic           = "Sum"
  threshold           = local.max_warnings

  # The metric filter publishes no datapoints while no matching events are
  # logged. Treat those gaps as healthy so the alarm returns to OK (and fires
  # ok_actions) instead of parking in INSUFFICIENT_DATA. Same setting as the
  # high-urgency error alarms.
  treat_missing_data = "notBreaching"
}

# Low-urgency alarm: controlplane converger errors reach local.max_errors
# (per-minute Sum) in 3 consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "controlplane_converger_errors_alarm_low_urgency" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = "${aws_cloudwatch_log_metric_filter.controlplane_converger_errors.name}-low-urgency"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = local.metric_names["controlplane_converger"]["error"]
  namespace           = local.namespace
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  statistic           = "Sum"
  threshold           = local.max_errors

  # The metric filter publishes no datapoints while no matching events are
  # logged. Treat those gaps as healthy so the alarm returns to OK (and fires
  # ok_actions) instead of parking in INSUFFICIENT_DATA. Same setting as the
  # high-urgency error alarms.
  treat_missing_data = "notBreaching"
}

# High-urgency alarm: controlplane converger errors reach local.max_errors
# (per-minute Sum) in at least local.high_priority_datapoints_to_alarm of the
# last local.high_priority_period 60-second periods.
resource "aws_cloudwatch_metric_alarm" "controlplane_converger_errors_alarm_high_urgency" {
  alarm_name = "${aws_cloudwatch_log_metric_filter.controlplane_converger_errors.name}-high-urgency"

  # Metric under evaluation.
  namespace   = local.namespace
  metric_name = local.metric_names.controlplane_converger.error
  statistic   = "Sum"
  period      = 60

  # Breach condition: M-of-N evaluation; periods without data count as healthy.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.max_errors
  evaluation_periods  = local.high_priority_period
  datapoints_to_alarm = local.high_priority_datapoints_to_alarm
  treat_missing_data  = "notBreaching"

  # Notify PagerDuty (high urgency) on both trigger and recovery.
  alarm_actions = [module.pagerduty.pd_high_urgency_arn]
  ok_actions    = [module.pagerduty.pd_high_urgency_arn]
}

# Low-urgency alarm: dashboard warnings reach local.max_warnings (per-minute
# Sum) in 3 consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "dashboard_warnings_alarm" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = aws_cloudwatch_log_metric_filter.dashboard_warnings.name
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = local.metric_names["dashboard"]["warning"]
  namespace           = local.namespace
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  statistic           = "Sum"
  threshold           = local.max_warnings

  # The metric filter publishes no datapoints while no matching events are
  # logged. Treat those gaps as healthy so the alarm returns to OK (and fires
  # ok_actions) instead of parking in INSUFFICIENT_DATA. Same setting as the
  # high-urgency error alarms.
  treat_missing_data = "notBreaching"
}

# Low-urgency alarm: dashboard errors reach local.max_errors (per-minute Sum)
# in 3 consecutive 60-second periods.
resource "aws_cloudwatch_metric_alarm" "dashboard_errors_alarm_low_urgency" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = "${aws_cloudwatch_log_metric_filter.dashboard_errors.name}-low-urgency"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = local.metric_names["dashboard"]["error"]
  namespace           = local.namespace
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  statistic           = "Sum"
  threshold           = local.max_errors

  # The metric filter publishes no datapoints while no matching events are
  # logged. Treat those gaps as healthy so the alarm returns to OK (and fires
  # ok_actions) instead of parking in INSUFFICIENT_DATA. Same setting as the
  # high-urgency error alarms.
  treat_missing_data = "notBreaching"
}

# High-urgency alarm: dashboard errors reach local.max_errors (per-minute Sum)
# in at least local.high_priority_datapoints_to_alarm of the last
# local.high_priority_period 60-second periods.
resource "aws_cloudwatch_metric_alarm" "dashboard_errors_alarm_high_urgency" {
  alarm_name = "${aws_cloudwatch_log_metric_filter.dashboard_errors.name}-high-urgency"

  # Metric under evaluation.
  namespace   = local.namespace
  metric_name = local.metric_names.dashboard.error
  statistic   = "Sum"
  period      = 60

  # Breach condition: M-of-N evaluation; periods without data count as healthy.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.max_errors
  evaluation_periods  = local.high_priority_period
  datapoints_to_alarm = local.high_priority_datapoints_to_alarm
  treat_missing_data  = "notBreaching"

  # Notify PagerDuty (high urgency) on both trigger and recovery.
  alarm_actions = [module.pagerduty.pd_high_urgency_arn]
  ok_actions    = [module.pagerduty.pd_high_urgency_arn]
}

# High-urgency alarm: the controlplane httpserver target group reports at least
# local.max_unhealthy_hosts unhealthy targets during a single 60-second period.
resource "aws_cloudwatch_metric_alarm" "controlplane_unhealthy_hosts" {
  alarm_actions       = [module.pagerduty.pd_high_urgency_arn]
  alarm_name          = "${var.cluster_name}-controlplane-unhealthy-hosts"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "UnHealthyHostCount"
  namespace           = "AWS/ApplicationELB"
  ok_actions          = [module.pagerduty.pd_high_urgency_arn]
  period              = 60

  # UnHealthyHostCount is a gauge sampled several times per period; Sum would
  # add the samples together and overstate the host count. Maximum yields the
  # worst observed count, so the threshold really means "number of hosts".
  statistic = "Maximum"
  threshold = local.max_unhealthy_hosts

  dimensions = {
    TargetGroup  = aws_alb_target_group.controlplane_httpserver.arn_suffix
    LoadBalancer = aws_alb.controlplane_httpserver.arn_suffix
  }
}

# High-urgency alarm: the dashboard target group reports at least
# local.max_unhealthy_hosts unhealthy targets during a single 60-second period.
resource "aws_cloudwatch_metric_alarm" "dashboard_unhealthy_hosts" {
  alarm_actions       = [module.pagerduty.pd_high_urgency_arn]
  alarm_name          = "${var.cluster_name}-dashboard-unhealthy-hosts"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "UnHealthyHostCount"
  namespace           = "AWS/ApplicationELB"
  ok_actions          = [module.pagerduty.pd_high_urgency_arn]
  period              = 60

  # UnHealthyHostCount is a gauge sampled several times per period; Sum would
  # add the samples together and overstate the host count. Maximum yields the
  # worst observed count, so the threshold really means "number of hosts".
  statistic = "Maximum"
  threshold = local.max_unhealthy_hosts

  dimensions = {
    TargetGroup  = aws_alb_target_group.dashboard_tg.arn_suffix
    LoadBalancer = aws_alb.dashboard_alb.arn_suffix
  }
}

# Low-urgency alarm: p99 TargetResponseTime on the controlplane httpserver ALB
# is at or above local.max_p99_latency (seconds) for 3 consecutive 60-second
# periods.
resource "aws_cloudwatch_metric_alarm" "p99" {
  alarm_actions       = [module.pagerduty.pd_low_urgency_arn]
  alarm_name          = "${var.cluster_name}-p99-latency"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 3
  metric_name         = "TargetResponseTime"
  namespace           = "AWS/ApplicationELB"
  ok_actions          = [module.pagerduty.pd_low_urgency_arn]
  period              = 60
  extended_statistic  = "p99"
  threshold           = local.max_p99_latency

  # AWS/ApplicationELB metrics are only published with at least a LoadBalancer
  # dimension, and alarms do not aggregate across dimensions; without this the
  # alarm never receives any datapoints.
  # NOTE(review): scoped to the controlplane httpserver ALB; confirm this is the
  # intended load balancer (the dashboard ALB is not covered by this alarm).
  dimensions = {
    LoadBalancer = aws_alb.controlplane_httpserver.arn_suffix
  }
}

