#!/usr/bin/env python
# -*- coding: utf8 -*-

import logging
import datetime

from optparse import OptionParser
from yql.api.v1.client import YqlClient
from yt.wrapper import YtClient

# Destination YT table for the selected URLs (written by write_urls()).
RESULT_TABLE = "//home/yabs/toloka/TopURLs"
# Cluster name, passed as `db` to YqlClient and as the proxy URL to YtClient.
YT_CLUSTER = "hahn"
# NOTE(review): not referenced anywhere in the visible code; presumably a YQL
# wait timeout in minutes -- confirm or remove.
WAIT_YQL_MINS = 60
# get_urls() stops taking URLs from a site once this many have been collected.
MAX_URLS_PER_SITE = 50
# get_urls() stops collecting once the result holds this many URLs in total.
LIMIT_URLS = 50000


"""
Selects popular URLs by click count from the chevent + hit logs,
so that these URLs can later be sent to Toloka workers.
"""


# Configure the root logger once, at import time.  Timestamps use the 24-hour
# clock (%H): the 12-hour %I directive without an AM/PM marker (%p) made
# morning and evening entries indistinguishable.
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s %(asctime)s]: %(message)s',
    datefmt='%m-%d-%Y %H:%M:%S'
)


# Host labels treated as generic (TLD-like) when looking for the label that
# identifies a site, e.g. "example" in "www.example.com.ua".
_GENERIC_DOMAIN_LABELS = ('ru', 'com', 'by', 'kz', 'ua', 'net')


def _parse_referer(ref):
    """Split a referer URL into its (site, project) grouping keys.

    ``project`` is the URL without its query string, cut after the first path
    segment (scheme + host + first segment).  ``site`` is the first host
    label, scanning right to left and skipping the rightmost one, that is not
    in ``_GENERIC_DOMAIN_LABELS``; if every scanned label is generic, the
    leftmost label is used.  A single-label host is its own site.

    Returns None when the referer is empty or has no host part.
    """
    if not ref:
        return None

    parts = ref.split("?")[0].split("/")
    # "scheme://host/..." splits into ["scheme:", "", "host", ...].
    if len(parts) < 3 or not parts[2]:
        return None

    project = "/".join(parts[0:4])

    labels = parts[2].split(".")
    labels.reverse()
    # Skip the rightmost label (the TLD) unless it is the only one; without
    # this fallback a single-label host would leave `site` unassigned.
    candidates = labels[1:] or labels
    site = candidates[-1]
    for label in candidates:
        if label not in _GENERIC_DOMAIN_LABELS:
            site = label
            break

    return site, project


def _aggregate_clicks(rows):
    """Fold (referer, pageid, count) rows into a site -> project -> ref tree.

    Returns a ``(stat, skipped)`` tuple, where ``skipped`` is the number of
    rows dropped because their referer could not be parsed.
    """
    stat = {}
    skipped = 0
    for row in rows:
        ref = row[0]
        pageid = row[1]
        cnt = row[2]

        parsed = _parse_referer(ref)
        if parsed is None:
            skipped += 1
            continue
        site, project = parsed

        site_stat = stat.setdefault(site, {"projects": {}, "cnt": 0})
        project_stat = site_stat["projects"].setdefault(
            project, {"refs": {}, "cnt": 0})
        ref_stat = project_stat["refs"].setdefault(ref, {"cnt": 0})

        site_stat["cnt"] += cnt
        project_stat["cnt"] += cnt
        ref_stat["cnt"] += cnt
        ref_stat["pageid"] = pageid

    return stat, skipped


def _select_top_urls(stat):
    """Pick the most-counted referer of each project, busiest sites first.

    At most MAX_URLS_PER_SITE projects are taken per site and at most
    LIMIT_URLS records overall; projects containing "nanpu" are excluded.
    """
    urls = []
    for site in sorted(stat, key=lambda s: stat[s]["cnt"], reverse=True):
        if len(urls) >= LIMIT_URLS:
            break

        site_stat = stat[site]
        projects = site_stat["projects"]
        site_urls = 0
        for project in sorted(projects, key=lambda p: projects[p]["cnt"], reverse=True):
            # Both counters only grow, so nothing further can be added for
            # this site once either limit is reached.
            if site_urls >= MAX_URLS_PER_SITE or len(urls) >= LIMIT_URLS:
                break
            if "nanpu" in project:
                continue

            refs = projects[project]["refs"]
            # Only the top referer of a project is emitted; max() returns the
            # first maximal key, same as taking the head of a stable
            # descending sort.
            ref = max(refs, key=lambda r: refs[r]["cnt"])
            urls.append({
                "site": site,
                "project": project,
                "ref": ref,
                "site_clicks": site_stat["cnt"],
                "project_clicks": projects[project]["cnt"],
                "ref_clicks": refs[ref]["cnt"],
                "pageid": refs[ref]["pageid"],
            })
            site_urls += 1

    return urls


def get_urls(yql_token=None):
    """Query yesterday's logs through YQL and return the top referer URLs.

    The query joins the 12:00 hourly chevent and hit log tables of the
    previous day on ``hitlogid``, keeps rows with fraudbits "0",
    countertype "2" and options matching "flat-page", and counts rows per
    referer.  Referers are then grouped by site and project, and the top
    referer of each project is selected (see ``_select_top_urls``).

    Rows whose referer is empty or has no host are skipped and reported in a
    single warning instead of aborting the run.

    Args:
        yql_token: YQL auth token.  When None, the module-level YQL_TOKEN
            (set by the command-line entry point) is used.

    Returns:
        A list of dicts with keys "site", "project", "ref", "site_clicks",
        "project_clicks", "ref_clicks" and "pageid".

    Raises:
        RuntimeError: if the YQL request did not finish successfully.
    """
    logging.info("get_urls")

    client = YqlClient(
        db=YT_CLUSTER,
        token=YQL_TOKEN if yql_token is None else yql_token,
    )

    prev_day = datetime.datetime.now() - datetime.timedelta(days=1)
    day = prev_day.strftime("%Y-%m-%d")
    request = client.query("""
        $is_flat = Re2::Grep("flat\\-page");
        select
            H.referer,
            min(cast(E.pageid as int64)),
            count(*) as cnt
        from
            [//home/logfeller/logs/bs-chevent-log/1h/%sT12:00:00] as E
        inner join
            [//home/logfeller/logs/bs-hit-log/1h/%sT12:00:00] as H
        using
            (hitlogid)
        where
                E.fraudbits="0"
            and
                E.countertype="2"
            and
                $is_flat(E.options)=True
        group by
             H.referer
    """ % (day, day))
    request.run()

    results = request.get_results()
    if not results.is_success:
        error_description = '\n'.join([str(err) for err in results.errors])
        logging.error(error_description)
        raise RuntimeError(error_description)

    table = results.table
    table.fetch_full_data()

    stat, skipped = _aggregate_clicks(table.rows)
    if skipped:
        logging.warning("skipped %d rows with a malformed referer", skipped)

    return _select_top_urls(stat)


def write_urls(urls, yt_token=None):
    """Write the selected URL records to RESULT_TABLE on YT.

    Args:
        urls: row dicts to write, as returned by get_urls().
        yt_token: YT auth token.  When None, the module-level YT_TOKEN (set
            by the command-line entry point) is used, so the function no
            longer depends on that global when a token is passed explicitly.
    """
    logging.info("write_urls")

    ytc = YtClient(config={
        "default_value_of_raw_option": False,
        "token": YT_TOKEN if yt_token is None else yt_token,
        "proxy": {"url": YT_CLUSTER},
    })

    # raw=False is passed explicitly as well as via the client config;
    # presumably it marks `urls` as structured rows rather than
    # pre-serialized data -- confirm against the yt.wrapper documentation.
    ytc.write_table(RESULT_TABLE, urls, raw=False)


if __name__ == "__main__":
    # Command-line entry point: read both auth tokens from the flags, publish
    # them as the module globals that get_urls() / write_urls() read, then
    # run the fetch -> write pipeline.
    cli = OptionParser()
    cli.add_option("--yql_token", help="YQL auth token")
    cli.add_option("--yt_token", help="YT auth token")
    opts, _positional = cli.parse_args()

    YQL_TOKEN, YT_TOKEN = opts.yql_token, opts.yt_token

    write_urls(get_urls())
