#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json
import datetime
import subprocess
from collections import defaultdict
import yaml
from nile.api.v1 import clusters
from yql.api.v1.client import YqlClient
from videolog_common import (
    apply_replacements,
    date_range,
    get_cluster,
    get_driver,
    get_stat_headers,
    get_dates_from_stat,
    get_date,
    StatPusher,
    YqlRunner,
)

# Passed as the `title` argument to the YqlRunner created in main().
TITLE = "[MMA-4376] COVID Internet Stats | YQL"


def _get_dates_set(yt, root, func=None):
    """Collect ``get_date(path)`` for every table found under ``root``.

    Args:
        yt: client object exposing ``search(root=..., node_type=...,
            path_filter=...)``.
        root: path the search starts from.
        func: optional predicate handed to ``search`` as ``path_filter``.
            When omitted, a path is kept if ``get_date`` returns a truthy
            value for it.

    Returns:
        set of the values ``get_date`` produced for the matched paths.
    """
    path_filter = func
    if path_filter is None:

        def path_filter(path):
            return bool(get_date(path))

    found_dates = set()
    matched_paths = yt.search(
        root=root, node_type="table", path_filter=path_filter
    )
    for table_path in matched_paths:
        found_dates.add(get_date(table_path))
    return found_dates


def _get_dates_to_process(co, input_roots, output_root):
    """Work out which dates still have to be processed.

    A date counts as available when every root in ``input_roots`` has a
    table for it, and as processed when ``output_root`` has a table for it.
    The dates newer than the latest processed one are returned.

    Args:
        co: dict of shared objects; only ``co["yt"]`` is used here.
        input_roots: list of paths whose tables are the inputs.
        output_root: path whose tables mark the already processed dates.

    Returns:
        set of available dates strictly greater than the last processed
        date. Empty when no date is available in all input roots.

    Raises:
        ValueError: if dates are available but ``output_root`` contains no
            dated table, so there is no "last processed date" to continue
            from.
        Exception: if a date between the last processed and the last
            available one is absent from at least one input root.
    """
    yt = co["yt"]
    print("input roots: {}".format(", ".join(input_roots)))
    tables_dates = [_get_dates_set(yt, root) for root in input_roots]
    # set.intersection builds a new set, so none of the collected sets is
    # mutated, and an empty list of roots simply yields "nothing available"
    # instead of leaving None behind.
    if tables_dates:
        available_dates = set.intersection(*tables_dates)
    else:
        available_dates = set()
    processed_dates = _get_dates_set(yt, output_root)

    if not available_dates:
        print("no dates available in all input roots")
        return set()
    if not processed_dates:
        # Without a starting point the amount of work is unbounded; fail
        # with an explicit message rather than an IndexError.
        raise ValueError(
            "no processed dates found in {}; "
            "run with explicit --from/--to first".format(output_root)
        )

    last_processed_date = max(processed_dates)
    print("last processed date: {}".format(last_processed_date))
    last_available_date = max(available_dates)
    print("last available date: {}".format(last_available_date))

    if last_available_date > last_processed_date:
        # Every day after the last processed one must be present in all
        # inputs; a hole means some log is late or lost.
        expected_dates = date_range(
            last_processed_date + datetime.timedelta(days=1),
            last_available_date,
        )
        missing_dates = sorted(
            d for d in expected_dates if d not in available_dates
        )
        if missing_dates:
            raise Exception(
                "logs missing for dates: {}".format(
                    ", ".join(map(str, missing_dates))
                )
            )
    return {d for d in available_dates if d > last_processed_date}


def run_map_for_dates(co, dates):
    """Run the map query over the span covered by ``dates``.

    The query text is read from ``config["map_query_file"]``, its
    placeholders are filled in with the pool, the smallest and largest date
    (as strings) and the map root, and the result is submitted through
    ``co["yr"]`` with the configured attachments.

    Args:
        co: dict of shared objects ("yr", "config", "args", "map_root").
        dates: non-empty iterable of dates; each is converted with ``str``.
    """
    day_strings = sorted(str(day) for day in dates)
    runner = co["yr"]
    cfg = co["config"]
    first_day, last_day = day_strings[0], day_strings[-1]
    print("running map for dates: {}".format(", ".join(day_strings)))

    with codecs.open(cfg["map_query_file"], "r", "utf8") as query_file:
        template = query_file.read()

    replacements = {
        "@[pool]": co["args"]["pool"],
        "@[date_from]": first_day,
        "@[date_to]": last_day,
        "@[map_root]": co["map_root"],
    }
    query = apply_replacements(template, replacements)
    attachments = [{"path": path} for path in cfg["query_attachments"]]
    runner.run(query, wait=True, attachments=attachments)


def run_research_and_prod_query(co, date):
    """Run the main (research & prod) query for the week ending at ``date``.

    The query text is read from ``config["main_query_file"]``; its
    placeholders are filled in with the pool, ``date`` minus seven days,
    ``date`` itself and the map / json / research roots, and the result is
    submitted through ``co["yr"]`` with the configured attachments.

    Args:
        co: dict of shared objects ("yr", "config", "args", "map_root",
            "json_root", "research_root").
        date: object supporting subtraction of ``datetime.timedelta``.
    """
    runner = co["yr"]
    cfg = co["config"]
    week_before = date - datetime.timedelta(days=7)
    start_day = str(week_before)
    end_day = str(date)
    message = "running research & prod query for dates: {}-{}".format(
        start_day, end_day
    )
    print(message)

    with codecs.open(cfg["main_query_file"], "r", "utf8") as query_file:
        template = query_file.read()

    # NOTE: the "@[formula]" placeholder (config["formula"]) is not
    # substituted here.
    replacements = {
        "@[pool]": co["args"]["pool"],
        "@[date_from]": start_day,
        "@[date_to]": end_day,
        "@[map_root]": co["map_root"],
        "@[json_root]": co["json_root"],
        "@[research_root]": co["research_root"],
    }
    query = apply_replacements(template, replacements)
    attachments = [{"path": path} for path in cfg["query_attachments"]]
    runner.run(query, wait=True, attachments=attachments)


def main():
    """Parse the command line and run the map and/or aggregation queries.

    When both ``--from`` and ``--to`` are given, the steps named in
    ``--actions`` ("map", "aggregation", comma separated) are run for that
    explicit range. Otherwise the dates are derived with
    ``_get_dates_to_process``: first the map step for new input dates, then
    the research & prod query for the newest map date that has no table
    under the json root yet.

    Reads the ``YT_PROXY`` and ``YQL_TOKEN`` environment variables and the
    YAML file given by ``--config`` (keys used here: ``output_root``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pool", default="robot-mma-nirvana")
    parser.add_argument("--from")
    parser.add_argument("--to")
    parser.add_argument("--actions", "-a", default="")
    parser.add_argument("--use_ready_map", action="store_true")
    # The config file is opened unconditionally below, so let argparse
    # report a missing option instead of failing later on open(None).
    parser.add_argument("--config", "-c", required=True)
    args = vars(parser.parse_args())

    with open(args["config"]) as f:
        config = yaml.safe_load(f)
    args["actions"] = args["actions"].split(",")

    from_ = args.get("from")
    to_ = args.get("to")

    cluster = get_cluster(clusters, args)
    yt = get_driver(cluster).client
    proxy = os.environ["YT_PROXY"]
    yc = YqlClient(db=proxy.lower(), token=os.environ["YQL_TOKEN"])
    yr = YqlRunner(client=yc, title=TITLE)
    common_objects = {
        "yt": yt,
        "yc": yc,
        "yr": yr,
        "cluster": cluster,
        "config": config,
        "args": args,
    }
    co = common_objects

    map_root = "{}/map".format(config["output_root"])
    json_root = "{}/json_data".format(config["output_root"])
    research_root = "{}/research".format(config["output_root"])
    co["map_root"] = map_root
    co["json_root"] = json_root
    co["research_root"] = research_root
    map_input_roots = [
        "//logs/crypta-rt-geo-log/1d",
        "//logs/yandex-access-log/1d",
        "//logs/browser-metrika-mobile-log/1d",
    ]

    # NOTE: the explicit mode needs BOTH --from and --to; with only one of
    # them the script falls through to the automatic mode.
    if from_ and to_:
        # Materialise once: the range is consumed by both steps below, and
        # date_range may hand back a one-shot iterator.
        dates = list(date_range(from_, to_))
        if "map" in args["actions"]:
            run_map_for_dates(co, dates)
        if "aggregation" in args["actions"]:
            run_research_and_prod_query(co, max(dates))
    else:
        dates_for_map = _get_dates_to_process(co, map_input_roots, map_root)
        if dates_for_map:
            run_map_for_dates(co, dates_for_map)
        else:
            print("no dates to run map for, skipping")
        dates_for_research_and_prod = _get_dates_to_process(
            co, [map_root], json_root
        )
        if dates_for_research_and_prod:
            run_research_and_prod_query(
                co, max(dates_for_research_and_prod)
            )


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()
