#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import argparse
import codecs
import datetime
import getpass
import json
import logging
import os
import re
import sys
import time
import urllib
try:
    import urllib.parse as urlparse
except ImportError:
    import urlparse
import smtplib
from email.mime.text import MIMEText
import time
from collections import defaultdict, Counter
import requests

try:
    basestring
except NameError:
    basestring = str

try:
    from nile.api.v1 import statface as ns
except ImportError:
    sys.stderr.write("no nile installed, StatPusher will be unavailable")

try:
    from yql.client.operation import YqlOperationType
except ImportError:
    YqlOperationType = None


# Spec dictionary requesting 10000 jobs for the generic, map and reduce job
# counts. The pool name is computed once, at import time, from the current
# OS user (getpass.getuser()).
# NOTE(review): presumably passed as a YT operation spec -- confirm with callers.
SPEC10k = {
    "job_count": 10000,
    "map_job_count": 10000,
    "reduce_job_count": 10000,
    "pool": "search-research_{}".format(getpass.getuser()),
}
# Pattern for a YYYY-MM-DD date substring (four, two and two digits).
re_date = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")


# ref_from values that are kept verbatim by ref_from_treatment() and by the
# YQL snippets built below (it is interpolated into their $ref_from_whitelist).
# Values outside this list are collapsed by prefix or mapped to "other".
ref_from_whitelist = [
    "appsearch",
    "browser",
    "efir",
    "efir_touch",
    "feed",
    "geoadv",
    "kp",
    "leagueoflegends.com",
    "morda",
    "morda_touch",
    "multiple",
    "ny2018",
    "ottwidget_kp",
    "ottwidget_morda",
    "ottwidget_tv",
    "ottwidget_ya-serp",
    "ottwidget_ya-video",
    "ottwidget_yavideo",
    "ott-smart-webos",
    "ott-smart-tizen",
    "ott-smart-samsung",
    "partner",
    "ru.yandex.quasar.app",
    "serp",
    "streamhandler_appsearch",
    "streamhandler_other",
    "streamhandler_serp",
    "streamhandler_serp_touch",
    "streamhandler_tv",
    "streamhandler_yabrowser",
    "streamhandler_yxnews",
    "turbo",
    "tv",
    "unknown",
    "videohub",
    "videohub_touch",
    "weather_desktop",
    "weather_turbo",
    "ya-market",
    "ya-music",
    "ya-tv",
    "ya-tv-in-page",
    "ya-tv-program",
    "ya-weather",
    "yanews",
    "yanewsautoplay",
    "yanewstragic",
    "yatvapp",
    "yavideo",
]


# YQL snippet defining $ref_from_preprocess in its strict form: NULL maps to
# "other", whitelisted values pass through, "zen_site_mobile*",
# "zen_site_desktop*", "zen*" and "streamhandler*" values are collapsed to a
# single bucket each, and everything else becomes "other".
# The "%wl" placeholder is replaced with the double-quoted, comma-separated
# contents of ref_from_whitelist when this module is imported.
zen_ref_from_treatment = """
$ref_from_whitelist = AsList(%wl);

$ref_from_preprocess = ($x) -> {
    RETURN CASE
    WHEN $x IS NULL THEN "other"
    WHEN ListHas($ref_from_whitelist, $x ?? "") THEN $x
    WHEN $x LIKE "zen_site_mobile%" THEN "zen_site_mobile"
    WHEN $x LIKE "zen_site_desktop%" THEN "zen_site_desktop"
    WHEN $x LIKE "zen%" THEN "zen"
    WHEN $x LIKE "streamhandler%" THEN "streamhandler_other"
    ELSE "other"
    END;
};
""".replace(
    "%wl", ", ".join('"{}"'.format(x) for x in ref_from_whitelist)
)

# Lenient variant of the YQL $ref_from_preprocess snippet. The whitelist and
# prefix rules are the same as in zen_ref_from_treatment, but an unknown value
# is kept as-is unless it contains a character outside [a-z0-9A-Z_.-], is
# longer than 30 characters, or contains no non-digit character; in those
# cases it becomes "other".
# The "%wl" placeholder is replaced with the double-quoted, comma-separated
# contents of ref_from_whitelist when this module is imported.
zen_ref_from_treatment_light = """
$ref_from_whitelist = AsList(%wl);

$re_nondigits = Re2::Capture("[^0-9]");
$re_nonascii = Re2::Capture("[^a-z0-9A-Z_.-]");

$ref_from_preprocess = ($x) -> {
    RETURN CASE
    WHEN $x IS NULL THEN "other"
    WHEN ListHas($ref_from_whitelist, $x ?? "") THEN $x
    WHEN $x LIKE "zen_site_mobile%" THEN "zen_site_mobile"
    WHEN $x LIKE "zen_site_desktop%" THEN "zen_site_desktop"
    WHEN $x LIKE "zen%" THEN "zen"
    WHEN $x LIKE "streamhandler%" THEN "streamhandler_other"
    WHEN $re_nonascii($x)._0 IS NOT NULL THEN "other"
    WHEN Length($x) > 30 THEN "other"
    WHEN $re_nondigits($x)._0 IS NULL THEN "other"
    ELSE $x
    END;
};
""".replace(
    "%wl", ", ".join('"{}"'.format(x) for x in ref_from_whitelist)
)


def ref_from_treatment(ref_from, treatment="harsh"):
    """Normalize a raw ``ref_from`` value into a reporting bucket.

    Rules, applied in order:

    * empty / ``None`` -> ``"other"``;
    * whitelisted values are returned unchanged;
    * ``zen_site_mobile*`` / ``zen/site_mobile*`` -> ``"zen_site_mobile"``;
    * ``zen_site_desktop*`` / ``zen/site_desktop*`` -> ``"zen_site_desktop"``;
    * any other ``zen*`` -> ``"zen"``;
    * any other ``streamhandler*`` -> ``"streamhandler_other"``;
    * with ``treatment="harsh"`` everything else -> ``"other"``;
    * with any other ``treatment`` the value is kept unless it contains a
      character outside ``[a-z0-9A-Z_.-]``, is longer than 30 characters or
      consists of digits only, in which case it becomes ``"other"``.

    :param ref_from: raw value (string or ``None``).
    :param treatment: ``"harsh"`` (default) or anything else for the lenient
        rules.
    :return: the normalized bucket name.
    """
    if not ref_from:
        return "other"
    # Plain list membership: no need to build a throwaway set on every call.
    if ref_from in ref_from_whitelist:
        return ref_from
    if ref_from.startswith(("zen_site_mobile", "zen/site_mobile")):
        return "zen_site_mobile"
    if ref_from.startswith(("zen_site_desktop", "zen/site_desktop")):
        return "zen_site_desktop"
    if ref_from.startswith("zen"):
        return "zen"
    if ref_from.startswith("streamhandler"):
        # Same bucket name as the whitelist, the YQL snippets in this module
        # and wrap_ref_from(), which all use "streamhandler_other".
        return "streamhandler_other"
    if treatment == "harsh":
        return "other"
    if re.search("[^a-z0-9A-Z_.-]", ref_from):
        return "other"
    if len(ref_from) > 30:
        return "other"
    if not re.search("[^0-9]", ref_from):
        return "other"
    return ref_from


def wrap_ref_from(ref_from, treatment="harsh"):
    """Return the tuple of report buckets a ``ref_from`` value contributes to.

    The value is first normalized with ref_from_treatment(). The result
    always starts with the normalized value and ends with ``"_total_"``;
    zen values additionally get ``"_zen_"``, all other values get
    ``"_total_without_zen_"`` and, for the listed sources, a group label.
    """
    bucket = ref_from_treatment(ref_from, treatment=treatment)
    if bucket.startswith("zen"):
        return (bucket, "_zen_", "_total_")

    touch_sources = (
        "morda_touch",
        "videohub_touch",
        "efir_touch",
        "efir_turboapp",
        "streamhandler_appsearch",
    )
    desktop_sources = ("morda", "videohub", "efir", "streamhandler_other")

    if bucket in touch_sources:
        group = ("Эфир-тач",)
    elif bucket in desktop_sources:
        group = ("Эфир",)
    else:
        group = ()
    return (bucket,) + group + ("_total_without_zen_", "_total_")


def tryint(string):
    """Convert ``string`` to ``int``, returning ``-1`` when it cannot be converted.

    Only conversion failures are handled (``TypeError``, ``ValueError``,
    ``OverflowError``); ``KeyboardInterrupt`` / ``SystemExit`` are no longer
    swallowed as they were with the previous bare ``except``.
    """
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        return -1


def apply_replacements(s, pairs):
    """Apply a sequence of ``(old, new)`` substring replacements to ``s``.

    :param s: the string to transform.
    :param pairs: a dict ``{old: new}`` or an iterable of indexable pairs;
        replacements are applied one after another in iteration order, so
        later pairs see the output of earlier ones.
    :return: the transformed string.
    :raises: whatever ``str.replace`` / indexing raises for a malformed pair.
        The error propagates to the caller instead of dropping into ``pdb``,
        which blocked or crashed non-interactive runs.
    """
    if isinstance(pairs, dict):
        pairs = pairs.items()
    for pair in pairs:
        s = s.replace(pair[0], pair[1])
    return s


def optionalize_schema(qt, schema):
    """Return a new dict mapping each key of ``schema`` to ``qt.Optional[value]``."""
    optional = {}
    for field, field_type in schema.items():
        optional[field] = qt.Optional[field_type]
    return optional


def arcanum_get_headers():
    """Build the OAuth ``Authorization`` header from the ``ARCANUM_TOKEN`` env variable.

    Raises ``KeyError`` when the variable is not set.
    """
    token = os.environ["ARCANUM_TOKEN"]
    authorization = "OAuth {}".format(token)
    return {"Authorization": authorization}


def arcadia_normalize_path(path):
    """Normalize an Arcadia path or URL to the ``trunk/...`` form.

    Steps, in order:

    1. everything up to and including the last ``arcadia.yandex.ru/arc/`` or
       ``arcadia/arc/`` marker is dropped;
    2. a leading ``arc/`` is removed;
    3. a path starting with ``arcadia`` (this includes
       ``arcadia_tests_data``) gets a ``trunk/`` prefix;
    4. a path that does not start with ``trunk`` gets ``trunk/arcadia/``.

    :param path: repository path or URL.
    :return: the normalized path.
    """
    if "arcadia.yandex.ru/arc/" in path:
        path = path.split("arcadia.yandex.ru/arc/")[-1]
    if "arcadia/arc/" in path:
        path = path.split("arcadia/arc/")[-1]
    if path.startswith("arc/"):
        # Strip only the leading prefix. Splitting on "arc/" and taking the
        # last piece also threw away every directory before a later "arc/".
        path = path[len("arc/"):]
    if path.startswith("arcadia"):
        path = "trunk/" + path
    if not path.startswith(("arcadia", "trunk")):
        path = "trunk/arcadia/" + path
    return path


def arcanum_get_metadata(path, headers=None, normalize=True):
    """Fetch the tree-node metadata of ``path`` from the Arcanum HTTP API.

    :param path: repository path; normalized with arcadia_normalize_path()
        unless ``normalize`` is false.
    :param headers: request headers; when empty or ``None`` the headers from
        arcanum_get_headers() are used.
    :param normalize: whether to normalize ``path`` first.
    :return: the decoded JSON body of the response.
    """
    request_headers = headers if headers else arcanum_get_headers()
    node_path = arcadia_normalize_path(path) if normalize else path
    url = "https://a.yandex-team.ru/api/tree/node/{}".format(node_path)
    # NOTE(review): certificate verification is disabled for this request.
    response = requests.get(url, verify=False, headers=request_headers)
    return response.json()


class YqlRunner(object):
    """Convenience wrapper that submits queries through a YQL client object.

    It prepends an optional common prefix to every query, attaches local
    files or Arcadia URLs, optionally waits for completion and re-submits a
    query when it fails with one of the known transient errors.
    """

    # Defaults merged into the keyword arguments of run().
    RUN_DEFAULT_ARGS = {
        "wait": True,
        "with_prefix": True,
        "syntax_version": 1,
        "title": None,
        "query_type": None,
    }

    # Pause (seconds) before re-submitting a query after a transient failure.
    RETRY_DELAY_SECONDS = 60 * 5

    # Substrings of the formatted issues that mark a failure as retryable.
    TRANSIENT_ISSUE_MARKERS = (
        "transient state",
        "mount-unmount operation",
        "DB::NetException",
        "Attempt to read after eof",
    )

    @staticmethod
    def arc_strip_prefix(path):
        """Remove a leading ``trunk/arcadia/`` from ``path``.

        A path that does not start with that prefix is returned unchanged;
        previously its first 14 characters were cut off regardless.
        """
        bad_prefix = "trunk/arcadia/"
        if path.startswith(bad_prefix):
            return path[len(bad_prefix):]
        return path

    def get_arcanum_url(self, arcadia_path):
        """Build an ``arc://<path>?rev=<rev>`` URL for ``arcadia_path``.

        The revision is the ``lastChangedRev`` field of the metadata returned
        by arcanum_get_metadata() for the normalized path.
        """
        normalized = arcadia_normalize_path(arcadia_path)
        metadata = arcanum_get_metadata(normalized, normalize=False)
        lcr = metadata["lastChangedRev"]
        stripped = self.arc_strip_prefix(normalized)
        return "arc://{}?rev={}".format(stripped, lcr)

    def __init__(
        self, client, title="YqlRunner | YQL", prefix="", verbose=True
    ):
        """
        :param client: object exposing ``query(text, title=..., syntax_version=...)``.
        :param title: default query title, used when run() gets no title.
        :param prefix: text prepended to every query unless
            ``with_prefix=False`` is passed to run().
        :param verbose: print the share URL of every submitted query.
        """
        self.client = client
        self.title = title
        self.prefix = prefix
        self.verbose = verbose

    def _attach_one(self, req, attachment):
        """Attach a single file path (string) or URL description (dict) to ``req``.

        A dict may carry ``url``; or ``path`` with ``rev``; or ``path`` alone
        (the revision is then looked up via Arcanum). An optional ``name``
        sets the alias. Values of any other type are ignored.
        """
        if isinstance(attachment, basestring):
            req.attach_file(attachment, alias=os.path.basename(attachment))
            return
        if not isinstance(attachment, dict):
            return
        if "rev" in attachment:
            assert "path" in attachment
            normalized = self.arc_strip_prefix(
                arcadia_normalize_path(attachment["path"])
            )
            url = "arc://{}?rev={}".format(normalized, attachment["rev"])
        elif "url" in attachment:
            url = attachment["url"]
        else:
            assert "path" in attachment
            url = self.get_arcanum_url(attachment["path"])
        if "name" in attachment:
            name = attachment["name"]
        else:
            # Default alias: last URL path component without the query part.
            name = url.split("/")[-1].split("?")[0]
        req.attach_url(url, alias=name)

    def run(self, query, **kwargs):
        """Submit ``query`` and, by default, wait for it to finish.

        Recognized keyword arguments: ``wait``, ``with_prefix``,
        ``syntax_version``, ``title``, ``query_type`` (attribute name looked
        up on ``YqlOperationType``) and ``attachments``.

        :return: the request object. When the query was re-submitted after a
            transient failure this is the request of the last attempt, not
            the failed one.
        :raises Exception: from wait() when the query fails with a
            non-transient error.
        """
        for name, default in self.RUN_DEFAULT_ARGS.items():
            kwargs.setdefault(name, default)
        if self.prefix and kwargs["with_prefix"]:
            query = self.prefix + query
        title = kwargs["title"] or self.title
        req = self.client.query(
            query, title=title, syntax_version=kwargs["syntax_version"]
        )
        if kwargs["query_type"]:
            req.type = getattr(YqlOperationType, kwargs["query_type"])
        for attachment in kwargs.get("attachments") or ():
            self._attach_one(req, attachment)
        req.run()
        if self.verbose:
            print("Running query {} ({})".format(title, req.share_url))
        if kwargs["wait"]:
            retried = self.wait(req, **kwargs)
            if retried is not None:
                # wait() re-submitted the query; hand back the live request
                # rather than the one that failed.
                req = retried
        return req

    @staticmethod
    def _get_result(table):
        """Fetch all rows of ``table`` and return them as a list of dicts keyed by column name."""
        table.fetch_full_data()
        colnames = table.column_names
        return [dict(zip(colnames, row)) for row in table.rows]

    def get_results(self, req, unwrap=True):
        """Return the result tables of ``req`` as lists of row dicts.

        With ``unwrap`` true and exactly one result table, that table's rows
        are returned directly instead of a one-element list.
        """
        results = [self._get_result(table) for table in req.get_results()]
        if len(results) == 1 and unwrap:
            return results[0]
        return results

    def wait(self, req, **kwargs):
        """Wait for ``req``; re-submit on transient errors, raise on others.

        :return: ``None`` when ``req`` completed, otherwise the request
            object returned by the re-submission via run().
        :raises Exception: when the request failed with an issue that is not
            listed in ``TRANSIENT_ISSUE_MARKERS``.
        """
        req.wait_progress()
        if req.status == "COMPLETED":
            return None
        formatted_issue = "\n\n".join(i.format_issue() for i in req.issues)
        is_transient = any(
            marker in formatted_issue
            for marker in self.TRANSIENT_ISSUE_MARKERS
        )
        if not is_transient:
            raise Exception(
                "request {} has failed: {}".format(
                    req.share_url, formatted_issue
                )
            )
        sys.stderr.write(
            "error in query {}: {}".format(req.share_url, formatted_issue)
        )
        time.sleep(self.RETRY_DELAY_SECONDS)
        retry_kwargs = dict(kwargs)
        query = req.query
        # NOTE(review): assumes req.query is the text that was submitted.
        # If it already begins with the prefix, do not prepend it again.
        if (
            self.prefix
            and isinstance(query, basestring)
            and query.startswith(self.prefix)
        ):
            retry_kwargs["with_prefix"] = False
        return self.run(query, **retry_kwargs)


def yql_run(query, token, maxtries=30, title=None):
    """Submit ``query`` to the YQL HTTP API and poll until it leaves the queue.

    The operation status is polled once a minute, at most ``maxtries`` times,
    while it is ``PENDING`` or ``RUNNING``. If the final status is not
    ``COMPLETED`` the last response body is written to stderr. Nothing is
    returned.

    :param query: query text.
    :param token: OAuth token for the API.
    :param maxtries: maximum number of polling iterations.
    :param title: optional title; ``" | YQL"`` is appended to it.
    """
    in_progress = {"PENDING", "RUNNING"}
    auth = {
        "Content-Type": "application/json",
        "Authorization": "OAuth {}".format(token),
    }
    payload = {"content": query, "action": "RUN", "type": "SQL"}
    if title:
        payload["title"] = "{} | YQL".format(title)

    req = requests.post(
        "https://yql.yandex.net/api/v2/operations", json=payload, headers=auth
    )
    submitted = req.json()
    id_ = submitted["id"]
    print("running query {}".format(id_))
    status = submitted["status"]

    attempt = 0
    while status in in_progress and attempt < maxtries:
        poll_url = "https://yql.yandex.net/api/v2/operations/{}".format(id_)
        req = requests.get(poll_url, headers=auth)
        status = req.json()["status"]
        print("operation status is {}".format(status))
        if status not in in_progress:
            # Finished: skip the sleep and leave the loop right away.
            break
        time.sleep(60)
        attempt += 1

    if status != "COMPLETED":
        sys.stderr.write("operation {} failed: {}".format(id_, req.content))


def parseparams(value, pairsep="\t", kvsep="="):
    """Parse ``key=value`` pairs out of a delimited string.

    ``value`` is split on ``pairsep``; each piece is split on the first
    ``kvsep`` into key and value. A piece without ``kvsep`` becomes a key
    mapped to ``"SINGLE"``. The result is a ``defaultdict`` that yields
    ``""`` for missing keys.
    """
    parsed = {}
    for chunk in value.split(pairsep):
        key, separator, rest = chunk.partition(kvsep)
        if separator:
            parsed[key] = rest
        else:
            parsed[chunk] = "SINGLE"
    return defaultdict(lambda: "", parsed)


def parse_cgi(url, prepend=None, extractfirst=True):
    """Parse the query string of ``url`` into a dict.

    Works on both Python 2 and Python 3; the previous unguarded reference to
    ``unicode`` raised ``NameError`` on Python 3.

    :param url: URL (or URL fragment when ``prepend`` is given).
    :param prepend: optional text put in front of ``url`` before parsing,
        e.g. a scheme and host for a bare query string.
    :param extractfirst: when true each key maps to its first value,
        otherwise to the list of all its values.
    :return: dict of CGI parameters; ``{}`` when the URL cannot be parsed.
    """
    if sys.version_info[0] == 2:
        # Python 2: urlparse works on byte strings, so encode unicode input.
        if isinstance(url, unicode):  # noqa: F821 -- exists on Python 2 only
            url = url.encode("utf8", errors="replace")
    elif isinstance(url, bytes):
        # Python 3: work on text so that ``prepend`` formatting and the
        # returned keys/values are str.
        url = url.decode("utf8", errors="replace")
    if prepend:
        url = "{}{}".format(prepend, url)
    try:
        parsed = urlparse.urlparse(url)
    except ValueError:
        return {}
    qs = urlparse.parse_qs(parsed.query)
    return {k: (v[0] if extractfirst else v) for k, v in qs.items()}


def tabulate(*args):
    """Join the ``format()``-ed positional arguments with tab characters."""
    cells = [format(value) for value in args]
    return "\t".join(cells)


def good_dump(x, fn):
    """Write ``x`` to file ``fn`` as pretty-printed UTF-8 JSON.

    The output uses a 2-space indent, sorted keys and keeps non-ASCII
    characters unescaped. The file is closed (and therefore flushed) before
    the function returns; previously the handle was left for the garbage
    collector to close.
    """
    with codecs.open(fn, "w", "utf8") as out:
        json.dump(x, out, indent=2, ensure_ascii=False, sort_keys=True)


def _check_non_empty(yt, table):
    """Tell whether ``table`` exists and its ``row_count`` attribute is positive.

    When the table does not exist the (falsy) result of ``yt.exists`` is
    returned as-is and the attribute is not read.
    """
    present = yt.exists(table)
    if not present:
        return present
    return yt.get_attribute(table, "row_count") > 0


def get_stat_headers():
    """Build authentication headers for the Stat API from the environment.

    With a non-empty ``STAT_LOGIN`` the robot user/password pair is returned
    (the password is ``STAT_TOKEN``); otherwise an OAuth ``Authorization``
    header built from ``STAT_TOKEN``. Raises ``KeyError`` when ``STAT_TOKEN``
    is not set.
    """
    login = os.environ.get("STAT_LOGIN")
    token = os.environ["STAT_TOKEN"]
    if login:
        return {"StatRobotUser": login, "StatRobotPassword": token}
    return {"Authorization": "OAuth {}".format(token)}


def get_missing_dates_from_stat(
    headers, report, date_min, date_max, scale="d", raw=False
):
    """Query the Stat ``missing_dates`` endpoint for a report and date span.

    :param headers: authentication headers for the request.
    :param report: report name.
    :param date_min: lower bound, interpolated into the URL as-is.
    :param date_max: upper bound, interpolated into the URL as-is.
    :param scale: report scale, ``"d"`` by default.
    :param raw: return the decoded JSON response unchanged.
    :return: the raw response when ``raw`` is true, otherwise a
        ``(missing, existing)`` pair of lists produced by applying
        ``yt_get_date_from_table`` to the ``missing`` and ``exists`` items.
    """
    template = (
        "https://upload.stat.yandex-team.ru/_api/report/missing_dates"
        "?name={}&scale={}&date_min={}&date_max={}"
    )
    url = template.format(report, scale, date_min, date_max)
    response = requests.get(url, headers=headers, verify=False)
    payload = response.json()
    if raw:
        return payload
    missing = list(map(yt_get_date_from_table, payload["missing"]))
    existing = list(map(yt_get_date_from_table, payload["exists"]))
    return (missing, existing)


def get_dates_from_stat(
    headers, report, dimensions=None, add_cgi=None, scale="d", raw=False
):
    """Query the Stat ``available_dates`` endpoint for a report.

    ``dimensions`` and ``add_cgi`` are accepted but not used by this
    function.

    :return: the ``available_dates`` list of the response when ``raw`` is
        true, otherwise that list with ``yt_get_date_from_table`` applied to
        every item.
    """
    template = (
        "https://upload.stat.yandex-team.ru/_api/report/available_dates"
        "?name={}&scale={}"
    )
    response = requests.get(
        template.format(report, scale), headers=headers, verify=False
    )
    available_dates = response.json()["available_dates"]
    if raw:
        return available_dates
    return list(map(yt_get_date_from_table, available_dates))


def yt_config_set_defaults(yt, logger=None, pytz_off=False):
    """Apply this project's default settings to ``yt.config`` in place.

    :param yt: object exposing a nested ``config`` mapping and ``YsonFormat``
        (presumably the ``yt.wrapper`` module -- confirm with callers).
    :param logger: optional logger whose handlers are installed on the
        ``"Yt"`` logger.
    :param pytz_off: when true, install a module filter that skips modules
        whose name contains ``"pytz"``.
    """
    # Module names used by the Anaconda-specific filter below.
    exceptions = {"ujson", "Crypto.Cipher._Blowfish"}
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"
    yt.config["spec_defaults"]["pool"] = SPEC10k["pool"]
    yt.config["spec_defaults"]["ignore_existing"] = True
    yt.config["tabular_data_format"] = yt.YsonFormat()
    # Under an Anaconda interpreter: keep only modules that have __file__,
    # whose name does not contain "hashlib", and that are either not a ".so"
    # file or not named in `exceptions`.
    # NOTE(review): because of the `or`, a module is dropped by that clause
    # only when it is BOTH a ".so" and listed in `exceptions` -- confirm this
    # is the intended rule.
    if "Anaconda" in sys.version:
        yt.config["pickling"]["module_filter"] = (
            lambda module: hasattr(module, "__file__")
            and (
                not module.__file__.endswith(".so")
                or getattr(module, "__name__", "") not in exceptions
            )
            and "hashlib" not in getattr(module, "__name__", "")
        )
    # NOTE(review): this assignment replaces the Anaconda filter set above
    # rather than combining with it.
    if pytz_off:
        yt.config["pickling"][
            "module_filter"
        ] = lambda module: "pytz" not in getattr(module, "__name__", "")
    yt.config["yamr_mode"]["create_recursive"] = True
    # Make the "Yt" logger use the same handler list as the given logger.
    if logger:
        logging.getLogger("Yt").handlers = logger.handlers


def make_logger(_file, debug=False):
    """Create a logger named after a script, logging to console and to a file.

    The log file is ``<script dir>/logs/<name>_<timestamp>_pid_<pid>.log``;
    the ``logs`` directory is created when missing. As before, the process
    working directory is changed to the script's directory.

    The script directory is resolved to an absolute path *before* the
    ``chdir``. Previously a bare file name produced an empty directory
    (``chdir("")`` fails, logs went to ``/logs``) and a relative directory
    was resolved a second time after the ``chdir``.

    :param _file: path of the calling script, typically ``__file__``.
    :param debug: console handler level is ``DEBUG`` when true, otherwise
        ``CRITICAL``; the file handler always logs at ``DEBUG``.
    :return: the configured ``logging.Logger``.
    """
    directory = os.path.dirname(os.path.abspath(_file))
    basename = os.path.splitext(os.path.basename(_file))[0]
    logger = logging.getLogger(basename)
    os.chdir(directory)

    logs_dir = os.path.join(directory, "logs")
    try:
        os.mkdir(logs_dir)
    except OSError:
        # Already present (possibly created concurrently) is fine; anything
        # else is a real error.
        if not os.path.isdir(logs_dir):
            raise

    formatter = logging.Formatter("%(asctime)s | %(message)s")
    logger.setLevel(logging.DEBUG)

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG if debug else logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    log_name = "{}_{}_pid_{}.log".format(
        basename,
        datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
        os.getpid(),
    )
    fh = logging.FileHandler(os.path.join(logs_dir, log_name), encoding="utf8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


def date_range(from_, to_):
    """Return every day between ``from_`` and ``to_``, both ends included.

    Either bound may be a ``YYYY-MM-DD`` string or a date object. The list
    runs from ``from_`` towards ``to_``, i.e. it is descending when ``to_``
    is earlier than ``from_``.
    """
    bounds = []
    for bound in (from_, to_):
        if isinstance(bound, basestring):
            bound = datetime.datetime.strptime(bound, "%Y-%m-%d").date()
        bounds.append(bound)
    start, stop = bounds

    earliest = min([start, stop])
    latest = max([start, stop])
    span = (latest - earliest).days
    days = [earliest + datetime.timedelta(days=n) for n in range(span + 1)]
    if stop < start:
        days.reverse()
    return days


def yt_date_to_ts(string_):
    """Convert a ``YYYY-MM-DD`` string to the Unix timestamp of its local midnight.

    Uses ``time.mktime`` instead of ``strftime("%s")``: the ``%s`` directive
    is a platform extension that is not available everywhere, while
    ``mktime`` gives the same local-time interpretation portably.

    :raises ValueError: when ``string_`` is not a valid ``YYYY-MM-DD`` date.
    """
    parsed = datetime.datetime.strptime(string_, "%Y-%m-%d")
    return int(time.mktime(parsed.timetuple()))


# Pattern for a YYYY-MM-DD date substring.
re_date = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")


def get_date(table):
    """Extract the first ``YYYY-MM-DD`` date found in ``table``.

    :return: a ``datetime.date``, or ``None`` when the string contains no
        date-shaped substring.
    :raises ValueError: when the matched digits are not a valid calendar date.
    """
    found = re_date.search(table)
    if found is None:
        return None
    stamp = datetime.datetime.strptime(found.group(0), "%Y-%m-%d")
    return stamp.date()


# Alias kept for callers that use the longer name.
yt_get_date_from_table = get_date


def send_sms(recipients, message):
    """Send ``message`` to ``recipients`` through the Golem SMS HTTP endpoint.

    Fixes over the previous version: ``quote`` is resolved for both Python 2
    (``urllib.quote``) and Python 3 (``urllib.parse.quote``), and the message
    is percent-encoded from its UTF-8 bytes rather than being round-tripped
    back to unicode, which Python 2's ``quote`` rejects for non-ASCII text.

    :param recipients: iterable of recipient identifiers, joined with commas.
    :param message: message text (text or UTF-8 encoded bytes).
    """
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    if not isinstance(message, bytes):
        message = message.encode("utf8")
    url = "https://golem.yandex-team.ru/api/sms/send.sbml?resps={}&msg={}".format(
        ",".join(recipients), quote(message)
    )
    requests.get(url)


def send_mail(msg, addressees, from_, subject):
    """Send a plain-text e-mail through the outbound SMTP relay.

    The ``subject`` argument is now actually used; previously every message
    was sent with a hard-coded subject regardless of what was passed. The
    SMTP connection is closed even when sending fails.

    :param msg: message body.
    :param addressees: list of recipient addresses.
    :param from_: sender address.
    :param subject: message subject.
    """
    message = MIMEText(msg)
    message["Subject"] = subject
    message["From"] = from_
    message["To"] = ",".join(addressees)
    server = smtplib.SMTP("outbound-relay.yandex.net")
    try:
        server.sendmail(message["From"], addressees, message.as_string())
    finally:
        server.quit()


class StatPusher(object):
    """Publishes data to a Statface report, either inline or from a table.

    Defaults given to the constructor (report, scale, replace mask, publish
    method, async mode) are used by push() whenever the corresponding
    argument is not supplied there.
    """

    def __init__(
        self,
        cluster,
        report=None,
        client=None,
        username_var="STAT_LOGIN",
        password_var="STAT_TOKEN",
        scale=None,
        replace_mask=None,
        remote_publish=None,
        async_mode=None,
    ):
        """
        :param cluster: cluster object; its driver is obtained via get_driver().
        :param report: default report path.
        :param client: ready Statface client; when falsy one is created from
            the environment variables named by ``username_var`` and
            ``password_var`` (login/password when the login variable is
            non-empty, token otherwise).
        :param scale: default report scale.
        :param replace_mask: default replace mask.
        :param remote_publish: default publish method (``None`` = decide
            automatically in push()).
        :param async_mode: default async flag for remote publishing.
        """
        self.cluster = cluster
        self.driver = get_driver(self.cluster)
        if not client:
            secret = os.environ[password_var]
            client_kwargs = {"proxy": "upload.stat.yandex-team.ru"}
            if os.environ.get(username_var):
                client_kwargs["username"] = os.environ[username_var]
                client_kwargs["password"] = secret
            else:
                client_kwargs["token"] = secret
            client = ns.StatfaceClient(**client_kwargs)
        self.client = client
        self.report = report
        self.scale = scale
        self.remote_publish = remote_publish
        self.replace_mask = replace_mask
        self.async_mode = async_mode

    def push(
        self,
        data,
        report=None,
        replace_mask=None,
        scale=None,
        remote_publish=None,
        async_mode=False,
    ):
        """Publish ``data`` to the report.

        ``data`` is either a table path (string) or the records themselves.
        For a table path with no explicit ``remote_publish`` setting, remote
        publishing is chosen when the table's ``row_count`` attribute is at
        least 30000; otherwise the table is read and published inline.
        """
        scale = scale or self.scale
        if scale is None:
            scale = "daily"
        report = report or self.report
        if remote_publish is None:
            remote_publish = self.remote_publish
        async_mode = async_mode or self.async_mode
        if replace_mask is None:
            replace_mask = self.replace_mask

        data_is_path = isinstance(data, basestring)
        if data_is_path and remote_publish is None:
            row_count = (
                self.driver.client.get_attribute(data, "row_count", 0) or 0
            )
            print("row count is {}".format(row_count))
            remote_publish = row_count >= 30000
            method = "remote" if remote_publish else "standard"
            print("choosing {} publish method".format(method))
        if data_is_path and not remote_publish:
            # Inline publishing needs the records themselves, not the path.
            data = [record.to_dict() for record in self.driver.read(data)]

        builder = (
            ns.StatfaceReport().path(report).client(self.client).scale(scale)
        )
        if replace_mask:
            builder = builder.replace_mask(replace_mask)

        if remote_publish:
            builder.remote_publish(
                proxy=self.cluster.proxy.split(".")[0],
                table_path=data,
                async_mode=async_mode,
                upload_config=False,
            )
        else:
            builder.data(data).publish()


def get_driver(cluster):
    """Return ``cluster.driver.yt_driver`` when it exists, else ``cluster.driver``."""
    driver = cluster.driver
    try:
        return driver.yt_driver
    except AttributeError:
        return driver


def get_cluster(clusters, args):
    """Instantiate and configure a cluster object from parsed arguments.

    :param clusters: object exposing ``yt`` and ``yql`` attributes, each of
        which exposes cluster factories by proxy name.
    :param args: dict or namespace-like object with optional keys ``pool``,
        ``no_yql``, ``proxy``, ``title``, ``job_root``, ``templates`` and
        ``yt_spec_defaults``.
    :return: the result of ``factory(**kwargs).env(templates=...,
        yt_spec_defaults=...)``.

    Reads ``YT_TOKEN`` always, ``YQL_TOKEN`` unless ``no_yql`` is set, and
    ``YT_PROXY`` when no ``proxy`` argument is given.
    """
    options = args if isinstance(args, dict) else vars(args)

    factory_kwargs = {"token": os.environ["YT_TOKEN"]}
    pool = options.get("pool")
    if pool:
        factory_kwargs["pool"] = pool

    if options.get("no_yql"):
        family = clusters.yt
    else:
        family = clusters.yql
        factory_kwargs["yql_token"] = os.environ["YQL_TOKEN"]

    # Fallback: first label of the YT_PROXY host, title-cased.
    proxy = options.get("proxy") or os.environ["YT_PROXY"].split(".")[0].title()

    templates = {"title": options.get("title") or "default_title"}
    job_root = options.get("job_root")
    if job_root:
        templates["job_root"] = job_root
    extra_templates = options.get("templates")
    if extra_templates:
        templates.update(extra_templates)

    spec_defaults = options.get("yt_spec_defaults", {})
    factory = getattr(family, proxy)
    return factory(**factory_kwargs).env(
        templates=templates, yt_spec_defaults=spec_defaults
    )


def read_file(filepath, mode="utf8"):
    """Read a whole file as bytes or as UTF-8 text.

    :param filepath: path of the file to read.
    :param mode: ``"binary"`` / ``"b"`` to get raw bytes, ``"utf8"`` / ``"u"``
        (default) to get decoded text.
    :return: the file contents.
    :raises AssertionError: for any other ``mode``.

    Binary mode opens the file with ``"rb"``. The previous ``"r"`` is text
    mode on Python 3, so it decoded the data and could fail on arbitrary
    bytes.
    """
    if mode in ("binary", "b"):
        opener = open(filepath, "rb")
    elif mode in ("utf8", "u"):
        opener = codecs.open(filepath, "r", "utf8")
    else:
        raise AssertionError("mode should be either (b)inary or (u)tf8")
    with opener as f:
        return f.read()


def main():
    """Script entry point; intentionally does nothing (the module only provides helpers)."""
    return None


if __name__ == "__main__":
    main()
