# Creates an ElasticSearch Cluster for Browser Grid Logging
# Includes a Proxy Instance to allow for connections to Kibana from Dev Computer

// Cross-account allow-list: IAM role ARNs that the ElasticSearch access policy
// trusts, so that log shippers in other AWS accounts can publish to the domain.
// The trailing comment on each entry names the account that owns the role.
variable "allowed_elasticsearch_access_arns" {
  description = "IAM role ARNs allowed to access the ElasticSearch domain (log publishers in other accounts)"

  default = [
    "arn:aws:iam::425992774280:role/cloudwatch_elasticsearch_shipper_LHxOp8yw", // twitch-cape-qe-aws
    "arn:aws:iam::148380705220:role/cloudwatch_elasticsearch_shipper_L3_U8pfw", // twitch-browsergrid-twilight-prod
    "arn:aws:iam::100860887800:role/cloudwatch_elasticsearch_shipper_liu-PfmX", // twitch-browsergrid-beacon-prod
    "arn:aws:iam::147030575244:role/cloudwatch_elasticsearch_shipper_f3y_sP32", // twitch-browsergrid-prod
  ]
}

// VPC that the aws_vpc.selected data source looks up.
variable "vpc_id" {
  description = "ID of the VPC to look up (default: twitch-browsergrid-prod)"
  default     = "vpc-01ff69ff4dfdb09f3"
}

// Subnet that the proxy instance is launched into (via aws_subnet.instance_subnet).
variable "subnet_id" {
  description = "ID of the subnet for the proxy instance (default: Public A)"
  default     = "subnet-03730d8d1b4da507c"
}

// Current region; its name is interpolated into the domain ARNs of the
// ElasticSearch access policy.
data "aws_region" "current" {
}

// Current caller identity; its account_id is interpolated into the domain ARNs
// of the ElasticSearch access policy and used as the ClientId alarm dimension.
data "aws_caller_identity" "current" {
}

// Looks up the subnet named by var.subnet_id; the proxy instance takes its
// subnet_id from this data source.
data "aws_subnet" "instance_subnet" {
  id = "${var.subnet_id}"
}

// Looks up the VPC named by var.vpc_id.
// NOTE(review): nothing in this section of the file references this data
// source; confirm it is used elsewhere before removing it.
data "aws_vpc" "selected" {
  id = "${var.vpc_id}"
}

// Name of the ElasticSearch domain; also interpolated into the domain ARNs of
// the access policy.
variable "domain" {
  description = "Name of the ElasticSearch domain used for Browser Grid logging"
  default     = "browsergrid-central-logging"
}

#############
# EC2 Proxy #
#############

# Reverse proxy instance that allows connections to the ElasticSearch stack.
# Use Systems Manager port forwarding to reach ES on localhost from a dev computer.
# Temporary, until a better authentication method is defined.

// IAM role for the proxy instance. The trust policy below lets the EC2 service
// (ec2.amazonaws.com) assume it; permissions are attached separately.
resource "aws_iam_role" "es_proxy" {
  name               = "kibana_es_proxy"
  assume_role_policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": "sts:AssumeRole",
      "Principal": {
        "Service": "ec2.amazonaws.com"
      },
      "Effect": "Allow",
      "Sid": ""
    }
  ]
}
EOF
}

// Instance profile wrapping the es_proxy role so it can be attached to the
// proxy EC2 instance.
resource "aws_iam_instance_profile" "es_proxy" {
  role = "${aws_iam_role.es_proxy.name}"
  name = "kibana_es_proxy"
}

// AWS-managed policy AmazonSSMManagedInstanceCore, looked up by ARN. It is
// attached to the proxy role to allow connections through Systems Manager.
data "aws_iam_policy" "AmazonSSMManagedInstanceCore" {
  arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}

// Attaches the AmazonSSMManagedInstanceCore managed policy to the proxy role.
resource "aws_iam_role_policy_attachment" "es_proxy_ssm" {
  policy_arn = "${data.aws_iam_policy.AmazonSSMManagedInstanceCore.arn}"
  role       = "${aws_iam_role.es_proxy.name}"
}

// EC2 instance acting as the Kibana / ElasticSearch proxy.
//
// Its public IP is the only source address admitted by the IP-restricted
// statement of the ElasticSearch access policy.
// NOTE(review): no Elastic IP is declared in this section, so the public IP
// presumably changes when the instance is stopped and started, leaving the
// access policy stale until the next apply. Confirm.
resource "aws_instance" "es_proxy" {
  ami           = "ami-09c5e030f74651050"
  instance_type = "t2.micro"

  subnet_id                   = "${data.aws_subnet.instance_subnet.id}"
  associate_public_ip_address = true

  iam_instance_profile = "${aws_iam_instance_profile.es_proxy.id}"
  key_name             = "kibana_proxy" // Required for Port Forwarding... Sigh.

  root_block_device {
    volume_size = 8
  }

  tags = {
    Owner = "reichsta"
    Name  = "kibana-proxy"
  }
}

// Alarm on the proxy instance's EC2 status checks: fires when the average of
// StatusCheckFailed is >= 1 for 5 consecutive 60-second periods.
// ALARM and OK transitions are both sent to aws_sns_topic.alerts, which is
// defined outside this section.
resource "aws_cloudwatch_metric_alarm" "es_proxy_statuscheckfailed" {
  alarm_name        = "${aws_instance.es_proxy.tags["Name"]}-${aws_instance.es_proxy.id}-StatusCheckFailed"
  alarm_description = "Reports whether the instance has passed both the instance status check and the system status check"

  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"

  dimensions = {
    InstanceId = "${aws_instance.es_proxy.id}"
  }

  statistic           = "Average"
  period              = 60
  evaluation_periods  = 5
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1 // 0 (passed) or 1 (failed)

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

#######################################
# ElasticSearch Service Configuration #
#######################################

// Service-linked role for the ElasticSearch service (es.amazonaws.com).
// The domain resource declares an explicit depends_on on this role.
// Required for ES running in a VPC.
// NOTE(review): the domain in this section sets no vpc_options; confirm the
// role is still needed.
resource "aws_iam_service_linked_role" "es" {
  aws_service_name = "es.amazonaws.com"
}

// ElasticSearch 7.1 domain that stores the Browser Grid logs.
//
// Layout: two m5.large.elasticsearch nodes spread over two availability zones,
// each with a 35 GB gp2 EBS volume. Encryption at rest and node-to-node
// encryption are both enabled. Automated snapshots start at hour 0.
//
// Creation is ordered after the ES service-linked role via depends_on.
resource "aws_elasticsearch_domain" "es_logging_stack" {
  domain_name           = "${var.domain}"
  elasticsearch_version = "7.1"

  cluster_config {
    instance_type  = "m5.large.elasticsearch"
    instance_count = 2

    // Multi-AZ
    zone_awareness_enabled = true

    zone_awareness_config {
      availability_zone_count = 2
    }
  }

  ebs_options {
    ebs_enabled = true
    volume_type = "gp2"
    volume_size = 35
  }

  snapshot_options {
    automated_snapshot_start_hour = 0
  }

  encrypt_at_rest {
    enabled = true
  }

  node_to_node_encryption {
    enabled = true
  }

  // Access policy. Every statement allows es:* on the sub-resources of this
  // domain (domain/<name>/*):
  //   1. Any principal, but only from the public IP of the EC2 proxy instance.
  //   2. The retention cleaner Lambda role (aws_iam_role.es_cleaner_lambda,
  //      defined outside this section).
  //   3. The roles listed in var.allowed_elasticsearch_access_arns, so other
  //      accounts can publish logs.
  // The heredoc end marker is kept at column 0, as the heredoc syntax requires.
  access_policies = <<CONFIG
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "AWS": "*"
      },
      "Action": "es:*",
      "Resource": "arn:aws:es:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:domain/${var.domain}/*",
      "Condition": {
        "IpAddress": {
          "aws:SourceIp": "${aws_instance.es_proxy.public_ip}/32"
        }
      }
    },
    {
      "Effect": "Allow",
      "Sid": "Allow retention cleaner",
      "Principal": {
        "AWS": "${aws_iam_role.es_cleaner_lambda.arn}"
      },
      "Action": [
        "es:*"
      ],
      "Resource": "arn:aws:es:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:domain/${var.domain}/*"
    },
    {
      "Effect": "Allow",
      "Sid": "Allow other accounts to publish logs",
      "Principal": {
        "AWS": ${jsonencode(var.allowed_elasticsearch_access_arns)}
      },
      "Action": [
        "es:*"
      ],
      "Resource": "arn:aws:es:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:domain/${var.domain}/*"
    }
  ]
}
CONFIG

  tags = {
    Service = "Grid"
  }

  depends_on = [
    "aws_iam_service_linked_role.es",
  ]
}

############################
# CloudWatch Alerts for ES #
############################

// Alarm on ClusterStatus.yellow: fires when the maximum over one 300-second
// period is >= 1. ALARM and OK transitions are both sent to
// aws_sns_topic.alerts, which is defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_status_yellow" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-ClusterStatus-yellow"
  alarm_description = "Indicates that the primary shards for all indices are allocated to nodes in a cluster, but the replica shards for at least one index are not"

  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.yellow"

  dimensions = {
    ClientId   = "${data.aws_caller_identity.current.account_id}"
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
  }

  statistic           = "Maximum"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Alarm on ClusterStatus.red: fires when the maximum over one 300-second
// period is >= 1. ALARM and OK transitions are both sent to
// aws_sns_topic.alerts, which is defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_status_red" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-ClusterStatus-red"
  alarm_description = "Indicates that the primary and replica shards of at least one index are not allocated to nodes in a cluster"

  namespace   = "AWS/ES"
  metric_name = "ClusterStatus.red"

  dimensions = {
    ClientId   = "${data.aws_caller_identity.current.account_id}"
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
  }

  statistic           = "Maximum"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Alarm on CPUUtilization for the domain: fires when the average over one
// 300-second period is >= 80. The description names the average because the
// alarm evaluates the Average statistic, not the Maximum.
// ALARM and OK transitions are both sent to aws_sns_topic.alerts, which is
// defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_cpu" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-CPUUtilization"
  alarm_description = "The average percentage of CPU resources used for data nodes in the cluster"

  namespace   = "AWS/ES"
  metric_name = "CPUUtilization"

  dimensions = {
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
    ClientId   = "${data.aws_caller_identity.current.account_id}"
  }

  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 80

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Alarm on FreeStorageSpace for the domain: fires when the total free space
// across the cluster is <= 500 MB in a one-minute period.
//
// The Sum statistic adds up every datapoint inside the period, so the period
// must stay at 60 seconds for Sum to equal the cluster total. A longer period
// would add several samples together and overstate the free space.
// ALARM and OK transitions are both sent to aws_sns_topic.alerts, which is
// defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_free_storage" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-FreeStorageSpace"
  alarm_description = "The free space for nodes in the cluster"

  namespace   = "AWS/ES"
  metric_name = "FreeStorageSpace"

  dimensions = {
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
    ClientId   = "${data.aws_caller_identity.current.account_id}"
  }

  statistic           = "Sum" // Sum shows total free space for the cluster
  period              = 60    // must be one minute for Sum to be accurate
  evaluation_periods  = 1
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = 500 // in MB

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Alarm on ClusterIndexWritesBlocked: fires when the maximum over one
// 300-second period is >= 1. ALARM and OK transitions are both sent to
// aws_sns_topic.alerts, which is defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_index_writes_blocked" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-ClusterIndexWritesBlocked"
  alarm_description = "Indicates whether your cluster is accepting or blocking incoming write requests"

  namespace   = "AWS/ES"
  metric_name = "ClusterIndexWritesBlocked"

  dimensions = {
    ClientId   = "${data.aws_caller_identity.current.account_id}"
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
  }

  statistic           = "Maximum"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1 // 0 means that the cluster is accepting requests. 1 means that it is blocking requests

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Alarm on KibanaHealthyNodes: fires when the maximum over one 300-second
// period is <= 0, i.e. no datapoint in the period reported a healthy Kibana.
// ALARM and OK transitions are both sent to aws_sns_topic.alerts, which is
// defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_kibana_healthy_nodes" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-KibanaHealthyNodes"
  alarm_description = "A health check for Kibana. A value of 1 indicates normal behavior. A value of 0 indicates that Kibana is inaccessible."

  namespace   = "AWS/ES"
  metric_name = "KibanaHealthyNodes"

  dimensions = {
    ClientId   = "${data.aws_caller_identity.current.account_id}"
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
  }

  statistic           = "Maximum"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = 0 // 0 indicates that Kibana is inaccessible. 1 indicates normal behavior

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Alarm on the domain's 5xx metric: fires when the maximum over one
// 300-second period is >= 50. ALARM and OK transitions are both sent to
// aws_sns_topic.alerts, which is defined outside this section.
resource "aws_cloudwatch_metric_alarm" "cluster_5xx" {
  alarm_name        = "${aws_elasticsearch_domain.es_logging_stack.domain_name}-5xx"
  alarm_description = "The number of requests to the domain that resulted in the given 5xx"

  namespace   = "AWS/ES"
  metric_name = "5xx"

  dimensions = {
    ClientId   = "${data.aws_caller_identity.current.account_id}"
    DomainName = "${aws_elasticsearch_domain.es_logging_stack.domain_name}"
  }

  statistic           = "Maximum"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 50

  alarm_actions = ["${aws_sns_topic.alerts.arn}"]
  ok_actions    = ["${aws_sns_topic.alerts.arn}"]
}

// Exposes the endpoint attribute of the logging domain.
output "elasticsearch_domain_endpoint" {
  description = "Endpoint of the Browser Grid logging ElasticSearch domain"
  value       = "${aws_elasticsearch_domain.es_logging_stack.endpoint}"
}

// Exposes the ARN of the logging domain.
output "elasticsearch_domain_arn" {
  description = "ARN of the Browser Grid logging ElasticSearch domain"
  value       = "${aws_elasticsearch_domain.es_logging_stack.arn}"
}
