# home/runtimecloud/research/KERNEL-691/cpuusage_per_hour
# leonidkiselev
from datetime import date as DATE, datetime, timedelta
import nirvana.job_context as nv
from yql.api.v1.client import YqlClient
import yt.wrapper as yt


def create_table(table_path):
    """Create a non-dynamic YT table node at ``table_path``.

    The node is created with ``ignore_existing=True`` and ``recursive=True``,
    so the call is issued the same way whether or not the table (or its
    parent nodes) are already there.

    Args:
        table_path: YT path of the table to create.
    """
    target = yt.TablePath(table_path, append=False)
    creation_options = {
        "ignore_existing": True,
        "recursive": True,
        "attributes": {"dynamic": False},
    }
    yt.create("table", target, **creation_options)


def write_table(table_path, table):
    """Write ``table`` (an iterable of rows) to the YT table at ``table_path``.

    The destination is addressed through a ``TablePath`` built with
    ``append=False``.

    Args:
        table_path: YT path of the destination table.
        table: rows to write, passed to ``yt.write_table`` unchanged.
    """
    yt.write_table(yt.TablePath(table_path, append=False), table)


def get_intervals(date):
    """Return the 24 one-hour ``[begin, end]`` intervals for ``date``.

    ``date`` is sliced positionally as ``YYYY?MM?DD`` (characters 0-3, 5-6 and
    8 onward). Interval ``i`` starts at hour ``i`` of that day shifted back by
    three hours and lasts exactly one hour.
    NOTE(review): the three-hour shift looks like a UTC+3 -> UTC conversion;
    confirm against the data source.

    Args:
        date: date string such as ``"2021-03-01"``.

    Returns:
        A list of 24 two-element lists of naive ``datetime`` objects.
    """
    year = int(date[:4])
    month = int(date[5:7])
    day = int(date[8:])

    shift = timedelta(hours=3)
    one_hour = timedelta(hours=1)

    result = []
    for hour in range(24):
        start = datetime(year, month, day, hour) - shift
        result.append([start, start + one_hour])
    return result


def split_chunks(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding up to ``chunk_size`` items.

    Every slice except possibly the last has exactly ``chunk_size`` items;
    an empty ``lst`` yields nothing.

    Args:
        lst: sliceable sequence to split.
        chunk_size: step between slice starts and maximum slice length.
    """
    starts = range(0, len(lst), chunk_size)
    for begin in starts:
        end = begin + chunk_size
        yield lst[begin:end]


def get_chunks(client, date, chunk_size):
    """Fetch the distinct (fqdn, cgrp) pairs for ``date`` and split them up.

    Runs a ``SELECT DISTINCT`` over the ``{date}-tmp/Short`` table, loads the
    first result table in full and groups its ``(row[0], row[1])`` pairs into
    lists of up to ``chunk_size`` pairs via ``split_chunks``.

    Args:
        client: YQL client exposing ``query(text, syntax_version=...)``.
        date: date string substituted into the source table path.
        chunk_size: maximum number of pairs per chunk.

    Returns:
        A list of chunks (each a list of ``(fqdn, cgrp)`` tuples), or ``None``
        when the request produced no result tables at all.
    """
    query = '''
    USE hahn;
    pragma yt.InferSchema;
    pragma SimpleColumns;
    PRAGMA yson.DisableStrict;
    PRAGMA AnsiInForEmptyOrNullableItemsCollections;
    pragma yt.Pool = 'runtimecloud';
    SELECT DISTINCT `fqdn`, `cgrp`
    FROM hahn.`//home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/Short`
    '''.format(date)

    request = client.query(query, syntax_version=1)
    request.run()

    # Only the first result table is ever consumed.
    results = iter(request.get_results())
    try:
        first_table = next(results)
    except StopIteration:
        return None

    first_table.fetch_full_data()
    pairs = []
    for row in first_table.rows:
        pairs.append((row[0], row[1]))
    return list(split_chunks(pairs, chunk_size))


def get_avg():
    """Return the Python source text of the ``get_avg`` UDF script.

    The script is kept below as a literal indented to match this function's
    body; the first four characters of every line are dropped before it is
    returned, and every line (including the leading empty one) is terminated
    with a newline. ``create_tables`` embeds the returned text into a YQL
    query as the ``Python3::get_avg`` script.

    Returns:
        The de-indented script as a single string.
    """
    indented_script = '''
    from yql.typing import *
    from datetime import date as DATE, datetime, timedelta
    def get_intervals(date):
        year, month, day = int(date[:4]), int(date[5:7]), int(date[8:])
        intervals = [[]] * 24
        for i in range(24):
            begin = datetime(year, month, day, i) - timedelta(hours=3)
            end = begin + timedelta(hours=1)
            intervals[i] = [begin, end]
        return intervals
    def get_avg(
        fqdn:Optional[String],
        cgrp:Optional[String],
        ctname:Optional[String],
        ts_and_cpuusage:Optional[List[String]]
        ) -> Optional[Dict[Tuple[Int64, String, String, String], Float]]:
        Unix = datetime(1970, 1, 1)
        service = (fqdn, cgrp, ctname)
        Date = ""
        data = []
        for pair in ts_and_cpuusage:
            pair = str(pair)
            Lpos = pair.find(' ')
            Rpos = pair.rfind(' ')
            ts, date, cpuusage = int(str(pair[2:Lpos])), str(pair[Lpos + 1:Rpos]), int(str(pair[Rpos + 1:-1]))
            data.append((ts, cpuusage))
            Date = date
        intervals = get_intervals(Date)
        cpuusage_per_hours = [None] * 24
        for pair in data:
            ts = pair[0]
            time = Unix + timedelta(seconds=ts)
            for i in range(24):
                if intervals[i][0] <= time < intervals[i][1]:
                    if not cpuusage_per_hours[i]:
                        cpuusage_per_hours[i] = []
                    cpuusage_per_hours[i].append(pair)
                    break
        cpuusage_avgs = [0.0] * 24
        for i in range(24):
            if not cpuusage_per_hours[i] or len(cpuusage_per_hours[i]) <= 1:
                continue
            cpuusage_per_hours[i].sort()
            sum, count = 0, 0
            for j in range(1, len(cpuusage_per_hours[i])):
                cpuusage = cpuusage_per_hours[i][j][1]
                cpuusage_prev = cpuusage_per_hours[i][j - 1][1]
                if cpuusage <= cpuusage_prev:
                    continue
                sum += cpuusage - cpuusage_prev
                count += 1
            if not count:
                continue
            avg = sum / count
            cpuusage_avgs[i] = avg
        table = {}
        for i in range(24):
            _ts = int((intervals[i][0] - Unix).total_seconds())
            _avg_cpuusage = cpuusage_avgs[i]
            _fqdn = service[0]
            _cgrp = service[1]
            _ctname = service[2]
            table[(_ts, _fqdn, _cgrp, _ctname)] = _avg_cpuusage
        return table'''

    # Strip the literal's four-character indent so the script starts at
    # column 0, keeping one trailing newline per line.
    return "".join(line[4:] + '\n' for line in indented_script.split('\n'))


def create_tables(client, date, services, chunk):
    """Build the intermediate tables for one chunk of services.

    Three YQL queries are executed one after another, each writing with
    ``WITH TRUNCATE`` under ``.../cpuusage_per_hour/{date}-tmp``:

    1. ``chunks/{n}``: rows of ``Short`` whose ``fqdn & cgrp`` key belongs
       to ``services[chunk]``.
    2. ``chunks-services/{n}``: one row per (fqdn, cgrp, ctname) holding the
       sorted list of ``"<ts> <cpuusage>"`` strings.
    3. ``chunks-pre-final/{n}``: the result of the ``Python3::get_avg`` UDF
       (script text supplied by ``get_avg()``) applied to every such row.

    ``n`` is ``chunk + 1``.

    Args:
        client: YQL client exposing ``query(text, syntax_version=...)``.
        date: date string used in the table paths.
        services: list of chunks, each a sequence of ``(fqdn, cgrp)`` pairs.
        chunk: zero-based index into ``services``.
    """
    def execute(text):
        # Submit the query and ask for its results before moving on.
        request = client.query(text, syntax_version=1)
        request.run()
        request.get_results()

    chunk_id = str(chunk + 1)

    # Quoted "fqdn & cgrp" keys, comma-separated, for the IN (...) filter.
    _in = ", ".join(
        '"{0} & {1}"'.format(service[0], service[1])
        for service in services[chunk]
    )

    filter_query = '''
    USE hahn;
    pragma yt.InferSchema;
    pragma SimpleColumns;
    PRAGMA yson.DisableStrict;
    PRAGMA AnsiInForEmptyOrNullableItemsCollections;
    pragma yt.Pool = 'runtimecloud';
    $Chunk = "home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/chunks/{1}";
    INSERT INTO $Chunk WITH TRUNCATE SELECT `ts`, `cpuusage`, `fqdn`, `cgrp`, `ctname` FROM hahn.`home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/Short`
    WHERE `fqdn` || " & " || `cgrp` IN ({2})'''.format(date, chunk_id, _in)
    execute(filter_query)

    group_query = '''
    USE hahn;
    pragma yt.InferSchema;
    pragma SimpleColumns;
    PRAGMA yson.DisableStrict;
    PRAGMA AnsiInForEmptyOrNullableItemsCollections;
    pragma yt.Pool = 'runtimecloud';
    $Chunk = "home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/chunks-services/{1}";
    INSERT INTO $Chunk WITH TRUNCATE
    SELECT ListSort(AGG_LIST(`ts` || " " || CAST(`cpuusage` AS String))) AS `ts_and_cpuusage`, `fqdn`, `cgrp`, `ctname`
    FROM hahn.`home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/chunks/{1}`
    GROUP BY `fqdn`, `cgrp`, `ctname`
    '''.format(date, chunk_id)
    execute(group_query)

    udf_script = get_avg()
    average_query = '''USE hahn; pragma yt.InferSchema; pragma SimpleColumns; PRAGMA yson.DisableStrict; PRAGMA AnsiInForEmptyOrNullableItemsCollections; pragma yt.Pool = 'runtimecloud'; $script = @@{0}@@; $get_avg = Python3::get_avg($script); $Chunk = "home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{1}-tmp/chunks-pre-final/{2}"; INSERT INTO $Chunk WITH TRUNCATE SELECT $get_avg(`fqdn`, `cgrp`, `ctname`, `ts_and_cpuusage`) AS `avg_cpuusage` FROM hahn.`//home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{1}-tmp/chunks-services/{2}`'''.format(udf_script, date, chunk_id)
    execute(average_query)


def finalize_table(date, chunk):
    """Flatten one ``chunks-pre-final`` table into per-hour rows.

    Each input row carries ``avg_cpuusage``, iterated here as records of the
    form ``[[ts, fqdn, cgrp, ctname], avg]``. For every input row one output
    row per timestamp is produced, in ascending ``ts`` order, with the
    average divided by 60000000000 and rounded to four decimals. The rows
    are written to ``chunks-final/{chunk + 1}``.
    NOTE(review): 60000000000 is 60 * 10**9, presumably nanoseconds per
    minute; confirm the unit of the source ``cpuusage`` counter.

    Args:
        date: date string used in the table paths.
        chunk: zero-based chunk index; tables are numbered ``chunk + 1``.
    """
    chunk_id = str(chunk + 1)
    divisor = 60000000000
    source_path = "//home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/chunks-pre-final/{1}".format(date, chunk_id)

    rows = []
    for service in yt.read_table(source_path):
        fqdn = cgrp = ctname = None
        avg_by_ts = {}
        for record in service["avg_cpuusage"]:
            key = record[0]
            # The identity fields are taken from the last record seen.
            ts, fqdn, cgrp, ctname = key[0], key[1], key[2], key[3]
            avg_by_ts[ts] = record[1]
        rows.extend([
            {
                "fqdn": fqdn,
                "cgrp": cgrp,
                "ctname": ctname,
                "ts": ts,
                "cpuusage": round(avg_by_ts[ts] / divisor, 4)
            }
            for ts in sorted(avg_by_ts)
        ])

    final_path = "//home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/chunks-final/{1}".format(date, chunk_id)
    create_table(final_path)
    write_table(final_path, rows)


def union_all(client, date, chunks):
    """Merge the per-chunk final tables into the result table for ``date``.

    Builds a single YQL query that binds every
    ``.../{date}-tmp/chunks-final/{n}`` path (``n`` = 1..``chunks``) to a
    ``$Half_n`` variable and inserts the ``UNION ALL`` of their
    ``fqdn``/``cgrp``/``ctname``/``ts``/``cpuusage`` columns into
    ``.../cpuusage_per_hour/{date}`` using ``WITH TRUNCATE``.

    Args:
        client: YQL client exposing ``query(text, syntax_version=...)``.
        date: date string used in the table paths.
        chunks: number of chunk tables to merge.

    Returns:
        None. When ``chunks`` is not positive there is nothing to merge and
        no query is sent.
    """
    # With no chunks there are no SELECT parts at all; composing the query
    # anyway would produce a truncated, invalid INSERT statement.
    if chunks <= 0:
        return

    chunk_ids = [str(chunk + 1) for chunk in range(chunks)]

    header = '''USE hahn; pragma yt.InferSchema; pragma SimpleColumns; PRAGMA yson.DisableStrict; PRAGMA AnsiInForEmptyOrNullableItemsCollections; pragma yt.Pool = 'runtimecloud';'''
    bindings = "".join(
        ''' $Half_{1} = "home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp/chunks-final/{1}";'''.format(date, chunk_id)
        for chunk_id in chunk_ids
    )
    insert = ''' INSERT INTO hahn.`home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}` WITH TRUNCATE '''.format(date)
    # Joining with the separator avoids having to trim a trailing UNION ALL.
    selects = " UNION ALL ".join(
        '''SELECT T_{0}.`fqdn` AS `fqdn`, T_{0}.`cgrp` AS `cgrp`, T_{0}.`ctname` AS `ctname`, T_{0}.`ts` AS `ts`, T_{0}.`cpuusage` AS `cpuusage` FROM $Half_{0} AS T_{0}'''.format(chunk_id)
        for chunk_id in chunk_ids
    )
    query = header + bindings + insert + selects

    request = client.query(
        query,
        syntax_version=1
    )
    request.run()
    request.get_results()


def main():
    """Run the whole per-hour CPU usage pipeline for one date.

    Reads ``chunk_size``, ``date`` and ``delete_tmp`` from the Nirvana job
    parameters. When ``date`` is not supplied, yesterday's date is used.
    Every chunk of services is processed with ``create_tables`` followed by
    ``finalize_table``, the per-chunk results are merged with ``union_all``,
    and the ``{date}-tmp`` directory is removed if ``delete_tmp`` is truthy.
    """
    yt.config.set_proxy("hahn")
    client = YqlClient(db="hahn")

    params = nv.context().get_parameters()
    chunk_size = params.get("chunk_size")
    requested_date = params.get("date")
    delete = params.get("delete_tmp")

    if requested_date is not None:
        date = requested_date
    else:
        # Default to the previous calendar day, formatted as YYYY-MM-DD.
        date = str(DATE.today() - timedelta(days=1))

    services = get_chunks(client, date, chunk_size)
    total = len(services)
    for chunk in range(total):
        create_tables(client, date, services, chunk)
        finalize_table(date, chunk)
    union_all(client, date, total)

    if not delete:
        return
    tmp_root = "//home/runtimecloud/research/KERNEL-691/cpuusage_per_hour/{0}-tmp".format(date)
    yt.remove(path=tmp_root, recursive=True)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

