#!/usr/bin/env ruby

require "aws-sdk"
require "elasticsearch"
require "elasticsearch/dsl"

# Script to verify data integrity between DynamoDB & ElasticSearch
class AwsDataChecker
  # Read access to the DynamoDB table holding the audits; only uuids are fetched.
  class DynamoDB
    TABLE_NAME = "history-staging" # Hardcoded for staging
    AWS_REGION = "us-west-2"
    LIMIT = 1000 # Upper bound on items requested per scan call

    # Memoized AWS DynamoDB client; built on first use and reused afterwards
    def self.client
      @client ||= Aws::DynamoDB::Client.new(region: AWS_REGION)
    end

    # Scan parameters for one page of the table: only the "uuid" attribute of
    # each item, at most LIMIT items, resuming after exclusive_start_key
    # (nil is passed through as-is for the first page).
    def self.scan_options(exclusive_start_key)
      {
        table_name: TABLE_NAME,
        exclusive_start_key: exclusive_start_key,
        select: "SPECIFIC_ATTRIBUTES",
        attributes_to_get: ["uuid"],
        limit: LIMIT
      }
    end

    # Scans one page and returns a two-element array:
    # [uuids of the page, last evaluated key to pass to the next call]
    def self.query(exclusive_start_key)
      response = client.scan(scan_options(exclusive_start_key))
      uuids = response.items.map { |record| record["uuid"] }

      [uuids, response.last_evaluated_key]
    end
  end

  # Read access to the Elasticsearch index holding the audits.
  class ElasticSearch
    HOST = "https://search-history-staging-bwibrmz6cxzumsw5j46iwhbdxq.us-west-2.es.amazonaws.com/history/audits" # Staging endpoint
    # Maximum number of hits requested by `dsl`. `fetch` can return at most
    # this many uuids, so callers should not pass more uuids than SIZE at once.
    SIZE = 1000

    class << self
      include Elasticsearch::DSL

      # ES client to perform queries on (memoized)
      def client
        @client ||= Elasticsearch::Client.new(host: HOST)
      end

      # ES query matching all audits. We're only interested in the total
      # count, so `size 0` asks ES not to return any documents with it.
      def count_dsl
        search do
          query do
            match_all true
          end
          size 0
        end.to_hash
      end

      # ES query to fetch the audits whose uuid is one of the given uuids,
      # limited to SIZE hits
      def dsl(uuids)
        search do
          query do
            match_all true
          end
          filter do
            terms "uuid" => uuids
          end
          size SIZE
        end.to_hash
      end

      # Returns the total audit count reported by ES
      def audits_count
        result = client.search body: count_dsl
        result["hits"]["total"]
      end

      # Fetches audits matching the given uuids, and returns their uuids.
      # Only the "uuid" field is read from each hit, so source filtering
      # (`_source: "uuid"`) keeps ES from sending back the rest of each document.
      def fetch(uuids)
        result = client.search body: dsl(uuids), _source: "uuid"
        result["hits"]["hits"].map { |hit| hit["_source"]["uuid"] }
      end
    end
  end

  class << self
    # Prints both totals and whether they are equal
    def compare_count(count_from_db, count_from_es)
      print "Comparing total count: DynamoDB(#{count_from_db}) with ES(#{count_from_es})... "

      verdict = count_from_db == count_from_es ? "Looks good" : "Incorrect"
      puts verdict
    end

    # Prints the batch sizes, then either a success message or the uuids that
    # are in the DynamoDB batch but not in the ES batch
    def compare_uuids(audits_from_db, audits_from_es)
      print "Comparing audits: DynamoDB(#{audits_from_db.length}) with ES(#{audits_from_es.length})... "

      absent_from_es = audits_from_db - audits_from_es
      return puts("Looks good") if absent_from_es.empty?

      puts "Audits missing from ES: #{absent_from_es.join(", ")}"
    end

    # Main method: walks the DynamoDB table page by page, checks every page of
    # uuids against ES, then compares the accumulated DynamoDB total with the
    # total reported by ES
    def verify_consistency
      total_in_db = 0
      next_key = nil

      loop do
        page, next_key = DynamoDB.query(next_key)
        compare_uuids(page, ElasticSearch.fetch(page))
        total_in_db += page.length

        # A nil key means DynamoDB has no further pages
        break if next_key.nil?
      end

      compare_count(total_in_db, ElasticSearch.audits_count)
    end
  end
end

# Entry point: runs the DynamoDB vs. ES consistency check as soon as this file is loaded
AwsDataChecker.verify_consistency
