#!/usr/bin/env python

from __future__ import print_function

import os
import re
import sys
import argparse

import requests

# This assumes that the script is stored in the "sandbox/scripts/dev/clickhouse" directory: the directory five levels
# above the script file is then the one containing the "sandbox" package, and it is appended to sys.path
# so that the "sandbox" imports below can be resolved.
_sandbox_parent = os.path.abspath(__file__)
for _level in range(5):
    _sandbox_parent = os.path.dirname(_sandbox_parent)
sys.path.append(_sandbox_parent)


from infi.clickhouse_orm import database
from sandbox import common
from sandbox.services.modules.statistics_processor.schemas import clickhouse as ch_schemas

# Matches a single "'name' = number" member inside an enum type description;
# a name may consist of word characters, hyphens and dots only.
ENUM_RE = re.compile(r"'([\w\-.]+)' = (\d+)")
# Appended to the full name of a new table to get the name of the table the old data is read from.
OLD_TABLE_SUFFIX = "__old"
# main() requests "<ENDPOINT>/<CTAG>" and treats every line of the response as a host to process.
ENDPOINT = "https://c.yandex-team.ru/api/tag2hosts"
CTAG = "sandbox_clickhouse"
# Default of the "--tables" option; main() takes it as "process every model in ch_schemas.SIGNAL_MODELS".
DEFAULT_TABLES_TO_MIGRATE = "all"

# Used to colorize the messages this script prints to stderr.
colorizer = common.console.AnsiColorizer()


def parse_enum(enum_type):
    """
    Collect the members listed in a textual enum type description.

    Every "'name' = number" pair that ENUM_RE finds in the string becomes an entry of the result;
    anything that does not match the pattern is ignored. If a name occurs more than once, the last value wins.

    :param enum_type: string to scan, such as "Enum16('a' = 1, 'b' = 2)"
    :return: dict mapping a member name to its value converted to int
    """
    return {
        name: int(number)
        for name, number in ENUM_RE.findall(enum_type)
    }


def merge_columns(column_name, old_values, new_values, verbose):
    """
    Extend the members of a new enum column with the names that occur only in the old column.

    :param column_name: name of the column, only used in the verbose message
    :param old_values: iterable of pairs; the first item of a pair is a name from the old column,
        the second one is ignored
    :param new_values: sequence of (name, number) pairs the new column currently consists of
    :param verbose: if true, report the names to be added to stderr
    :return: an empty list if the new column already has every old name; otherwise a list with all of `new_values`
        followed by the missing names, numbered consecutively starting right after the largest existing number
    :raises ValueError: if there are names to add while `new_values` is empty (there is no number to continue from)
    """
    # Build the lookup set once instead of once per old value
    known_names = {pair[0] for pair in new_values}
    new_strings = [pair[0] for pair in old_values if pair[0] not in known_names]
    if not new_strings:
        return []

    if verbose:
        print(
            colorizer.yellow("{} new value(s) for {}: {}".format(len(new_strings), column_name, new_strings)),
            file=sys.stderr
        )

    first_free = max(pair[1] for pair in new_values) + 1
    # enumerate() keeps this working on both Python 2 and 3 (no xrange, no "list + zip object")
    return list(new_values) + [(name, number) for number, name in enumerate(new_strings, first_free)]


def process(shard, schema, options, insert=False):
    """
    Extend enum columns of a table on a single host and, optionally, copy the data of the old table into it.

    The "new" table is `<database>.<schema.underlying_table_name()>`, the "old" one has the same name
    with OLD_TABLE_SUFFIX appended. For every column that exists in both tables and is declared in the schema
    as an AutoEnumField, the enum is extended with the values found only in the old column
    (distinct values of an old String column, or members of an old enum column); the ALTER is issued for both
    `schema.underlying_table_name()` and `schema.table_name()`. Queries are executed only if
    `options.no_dry_run` is set, otherwise they are just printed to stderr.
    Failures are printed to stderr and do not raise.

    :param shard: host name; the connection is made to "http://<shard>:8123"
    :param schema: model class of the table (one of the values of `ch_schemas.SIGNAL_MODELS`)
    :param options: parsed command line options; `database`, `username`, `password`, `no_dry_run` and `verbose`
        are read here
    :param insert: if true, also build (and, unless in dry run mode, execute) the INSERT ... SELECT query
        which copies the old table into the new one
    """
    connection = database.Database(
        options.database,
        "http://{}:8123".format(shard),
        readonly=not options.no_dry_run,
        username=options.username,
        password=options.password
    )

    # Enum columns which are enums in the old table too: they are wrapped in toString() in the INSERT query
    convert_to_string = set()
    missing_in_old, missing_in_new = set(), set()

    new_table = "{}.{}".format(options.database, schema.underlying_table_name())
    old_table = new_table + OLD_TABLE_SUFFIX

    def get_columns(tn):
        # column name -> description of the column, as reported by DESCRIBE TABLE
        return {
            row.name: row.to_dict()
            for row in connection.select("DESCRIBE TABLE {}".format(tn))
        }

    def enum_values(column):
        # (name, number) pairs of an enum column ordered by number
        return sorted(parse_enum(column["type"]).items(), key=lambda e: e[1])

    try:
        old_columns = get_columns(old_table)
        new_columns = get_columns(new_table)
    except Exception as exc:
        print(
            colorizer.red("Failed to get information about the table {}: {}.".format(new_table, str(exc).strip())),
            file=sys.stderr
        )
        return

    new_data = {}
    for column_name in set(old_columns) | set(new_columns):
        old_col, new_col = old_columns.get(column_name), new_columns.get(column_name)

        if new_col is None:
            missing_in_new.add(column_name)
            continue
        if old_col is None:
            missing_in_old.add(column_name)
            continue

        # A column reported by the server may have no attribute in the schema;
        # such a column cannot be an AutoEnumField, so it is skipped instead of failing with AttributeError
        if type(getattr(schema, column_name, None)).__name__ != "AutoEnumField":
            continue

        # old String -> new AutoEnum:
        #   alter new column to (new values) + (distinct old values missing in a new column)
        #   INSERT INTO TABLE ... SELECT * FROM ...
        #
        # old AutoEnum -> new AutoEnum:
        #   alter new column to (new values) + (old values missing in a new column)
        #   INSERT INTO TABLE ... SELECT ..., toString(enum_column), ... FROM ...

        if old_col["type"].startswith("String"):
            if options.verbose:
                print(
                    colorizer.yellow("Reading old String values of {}.{}".format(old_table, column_name)),
                    file=sys.stderr
                )
            try:
                values = list(connection.select("SELECT DISTINCT {} FROM {}".format(column_name, old_table)))
                old_values = [
                    (getattr(row, column_name), None)
                    for row in values
                ]
            except Exception as exc:
                print(
                    colorizer.red(
                        "Failed to read String values of {}.{}, skipping: {}".format(old_table, column_name, exc)
                    ),
                    file=sys.stderr
                )
                continue

        else:
            old_values = enum_values(old_col)
            convert_to_string.add(column_name)

        new_values = enum_values(new_col)
        result = merge_columns("{}.{}".format(new_table, column_name), old_values, new_values, options.verbose)
        if result:
            new_data[column_name] = result

    for name, content in new_data.items():
        # NOTE: member names are put into the query as is; a name containing a single quote yields an invalid query
        members = ", ".join("'{}' = {}".format(k, v) for k, v in sorted(content, key=lambda e: e[1]))
        for tn in (schema.underlying_table_name(), schema.table_name()):
            table = "{}.{}".format(options.database, tn)
            expr = "ALTER TABLE {} MODIFY COLUMN {} Enum16({});".format(table, name, members)
            if options.verbose or not options.no_dry_run:
                print(colorizer.green(expr), file=sys.stderr)

            if options.no_dry_run:
                try:
                    connection.raw(expr)
                except Exception as exc:
                    # a failed ALTER is reported, the remaining queries are still attempted
                    print(colorizer.red("Failed to ALTER table {}: {}".format(table, exc)), file=sys.stderr)

    if not insert:
        return

    # One SELECT expression per schema field which exists in the new table, in the order of schema._fields
    values = []
    for field_name, model in schema._fields:
        if field_name in missing_in_new:
            continue

        if field_name in convert_to_string:
            values.append("toString({})".format(field_name))
        elif field_name in missing_in_old:
            # the old table has no such column, so the default value of the field is selected instead
            values.append("{} as {}".format(model.to_db_string(model.default), field_name))
        else:
            values.append(field_name)

    expr = "INSERT INTO TABLE {new_table} SELECT {select_expr} FROM {old_table}".format(
        new_table=new_table, old_table=old_table, select_expr=", ".join(values)
    )
    print(colorizer.green(expr), file=sys.stderr)

    if options.no_dry_run:
        try:
            connection.raw(expr)
        except Exception as exc:
            print(colorizer.red("Failed to INSERT old data into table {}: {}".format(new_table, exc)), file=sys.stderr)


def parse_args():
    """
    Describe the command line interface of the script and parse the actual command line.

    :return: the namespace produced by `argparse.ArgumentParser.parse_args()` with the attributes
        `username`, `password`, `database`, `shards`, `tables`, `no_dry_run`, `verbose` and `please`
    """

    def wide_defaults_formatter(*args, **kwargs):
        # the stock formatter which shows defaults, but with help wrapped at 120 columns
        return argparse.ArgumentDefaultsHelpFormatter(*args, width=120, **kwargs)

    # (flag, add_argument() keyword arguments), in the order the options appear in the help
    arguments = (
        ("--username", dict(
            type=str, action="store", default="sandbox",
            help="Username to access ClickHouse with",
        )),
        ("--password", dict(
            type=str, action="store", default=None,
            help="Password to access ClickHouse with, if omitted, I'll read /home/zomb-sandbox/.clickhouse_password",
        )),
        ("--database", dict(
            type=str, action="store", default="sandbox",
            help="Database name",
        )),
        ("--shards", dict(
            nargs="+", required=True,
            help=(
                "FQDNs of shards. "
                "Warning: there must be NO replicas of the same shard in this list, otherwise data will duplicate!"
            ),
        )),
        ("--tables", dict(
            nargs="+", default=DEFAULT_TABLES_TO_MIGRATE,
            help="Tables to migrate",
        )),
        ("--no-dry-run", dict(
            action="store_true",
            help="Execute the resulting queries instead of printing",
        )),
        ("--verbose", dict(
            action="store_true",
            help="Additional verbosity",
        )),
        ("--please", dict(
            action="store_true",
            help="Confirm that all outer systems which alter tables, such as StatisticsProcessor, are STOPPED",
        )),
    )

    parser = argparse.ArgumentParser(
        description="Migrate data from old ClickHouse tables with String/AutoEnum fields to new ones with AutoEnum",
        formatter_class=wide_defaults_formatter,
    )
    for flag, params in arguments:
        parser.add_argument(flag, **params)
    return parser.parse_args()


def main(options):
    """
    Run the migration on every host listed under the CTAG tag.

    Every host returned by "<ENDPOINT>/<CTAG>" gets its enum columns extended by process(); the old data
    is additionally copied on the hosts passed via "--shards".

    The script exits with code 1 if "--no-dry-run" is given without "--please",
    or if "--tables" contains a name missing from `ch_schemas.SIGNAL_MODELS`.

    :param options: parsed command line options, see parse_args()
    :raises requests.RequestException: if the list of hosts cannot be fetched
    """
    if options.no_dry_run and not options.please:
        print(
            colorizer.red(
                "Stop everything that may alter tables at the same time as this script runs "
                "and re-run this script with \"--please\" flag"
            ),
            file=sys.stderr
        )
        sys.exit(1)

    if not options.password:
        options.password = common.utils.read_settings_value_from_file("/home/zomb-sandbox/.clickhouse_password")

    # "--tables" is the plain default string when omitted and a list when given explicitly ("--tables all" included).
    # The schemas are materialized into a list, since they are iterated once per replica.
    if options.tables in (DEFAULT_TABLES_TO_MIGRATE, [DEFAULT_TABLES_TO_MIGRATE]):
        schemas_to_process = list(ch_schemas.SIGNAL_MODELS.values())
    else:
        unknown_tables = [name for name in options.tables if name not in ch_schemas.SIGNAL_MODELS]
        if unknown_tables:
            print(
                colorizer.red(
                    "Unknown table(s): {}. Known tables: {}".format(
                        ", ".join(unknown_tables), ", ".join(sorted(ch_schemas.SIGNAL_MODELS))
                    )
                ),
                file=sys.stderr
            )
            sys.exit(1)
        schemas_to_process = [ch_schemas.SIGNAL_MODELS[name] for name in options.tables]

    # Fail loudly on an HTTP error instead of treating the lines of an error page as host names
    response = requests.get("{}/{}".format(ENDPOINT, CTAG), timeout=60)
    response.raise_for_status()
    replicas = [line.strip() for line in response.text.splitlines() if line.strip()]

    # Data is inserted only on replicas named in "--shards", so a mistyped shard would be skipped silently
    unlisted_shards = sorted(set(options.shards) - set(replicas))
    if unlisted_shards:
        print(
            colorizer.yellow(
                "Shard(s) {} are not among the hosts tagged {}, no data will be inserted there".format(
                    ", ".join(unlisted_shards), CTAG
                )
            ),
            file=sys.stderr
        )

    for replica in replicas:
        is_shard = replica in options.shards
        print(colorizer.blue("Processing replica {} (shard={})".format(replica, is_shard)), file=sys.stderr)
        for schema in schemas_to_process:
            print(colorizer.blue("Processing schema {}".format(schema.__name__)), file=sys.stderr)
            process(replica, schema, options, insert=is_shard)


# Run the migration only when the file is executed as a script, not when it is imported
if __name__ == "__main__":
    main(parse_args())
