#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
import datetime
import getpass

import itertools

from pytils import date_range, make_logger

from qb2.api.v1 import extractors as se, filters as sf

from collections import Counter
import math

import nile
import pandas as pd
from monytoring import Monitoring


# Day format used for the module-level `date` string.
date_format = '%Y-%m-%d'
# Module-level default day: yesterday relative to the moment of import.
# e.g. DATE = datetime.date(2017, 3, 15) pins a fixed day instead.
DATE = datetime.datetime.today() - datetime.timedelta(days=1)
date = DATE.strftime(date_format)

# Every user works in a personal directory and a personal pool.
username = getpass.getuser()
job_root = '//home/atom/{}/distr_metric_x/web'.format(username)
# `job_root` is registered as a template so paths can use `$job_root`.
cluster = clusters.Hahn(
    pool='search-research_{}'.format(username)
).env(
    templates={'job_root': job_root}
)


def choose_interval(num, intervals):
    """Return the first interval from *intervals* that contains *num*.

    Each interval is an indexable pair ``(low, high)`` with both bounds
    inclusive. Every interval is examined before the first match is
    returned.

    Raises:
        IndexError: if no interval contains *num*.
    """
    matching = []
    for candidate in intervals:
        if candidate[0] <= num <= candidate[1]:
            matching.append(candidate)
    return matching[0]


def counter_quantile(counter, quantile):
    """Return the *quantile* of the multiset described by *counter*.

    *counter* maps a value to the number of times it occurs. Conceptually
    the values are laid out in ascending order at positions ``0 .. N-1``
    (``N`` being the total count) and the element at position
    ``(N - 1) * quantile`` is returned. When that position falls between two
    integers the result is the mean of the two neighbouring elements
    (a midpoint, not a linear interpolation).

    Args:
        counter: mapping ``value -> occurrence count`` (e.g. a ``Counter``).
        quantile: fraction in ``[0, 1]``.

    Returns:
        The quantile value, or ``0`` when *counter* holds no observations.

    Raises:
        IndexError: if *quantile* points outside the observed positions.
    """
    keys = sorted(k for k in counter if counter[k] != 0)
    # last_positions[i] is the last 0-based position occupied by keys[i].
    last_positions = []
    position = -1
    for k in keys:
        position += counter[k]
        last_positions.append(position)

    last_index = sum(counter.values()) - 1
    # Only a truly empty counter has no quantile; a single observation is
    # its own quantile for every fraction.
    if last_index < 0:
        return 0

    def value_at(index):
        # Value stored at the given position of the sorted layout.
        if index >= 0:
            for k, last in zip(keys, last_positions):
                if index <= last:
                    return k
        raise IndexError('quantile position out of range')

    target = last_index * quantile
    nearest = round(target)
    # Fractions such as 3 * 0.1 are not exact in binary floating point, so
    # 10 * (3 * 0.1) is 3.0000000000000004. Snap such near-integers to the
    # integer position instead of averaging two neighbouring elements.
    if abs(target - nearest) <= 1e-12 * max(1.0, abs(target)):
        return value_at(nearest)
    return (
        value_at(math.floor(target)) + value_at(math.ceil(target))
    ) / 2.0


def process_distr_obj(distr_obj, referer):
    """Split the ``portal_popup`` object by the page it was shown on.

    ``portal_popup`` becomes ``portal_popup_morda`` for referer
    ``yandex.ru`` and ``portal_popup_serp`` for ``yandex.ru/search``.
    Any other object, or any other referer, leaves *distr_obj* unchanged.
    """
    if distr_obj == 'portal_popup':
        if referer == 'yandex.ru':
            return 'portal_popup_morda'
        if referer == 'yandex.ru/search':
            return 'portal_popup_serp'
    return distr_obj


def get_plat(counter):
    """Map a counter id (string) to its platform.

    Returns ``'desktop'`` or ``'touch'`` for the four known counters and
    ``'-'`` for anything else.
    """
    if counter in ('731962', '722545'):
        return 'desktop'
    if counter in ('22555771', '23474449'):
        return 'touch'
    return '-'


def get_service(counter):
    """Map a counter id (string) to the service it belongs to.

    Returns ``'search'`` or ``'morda'`` for the four known counters and
    ``'-'`` for anything else.
    """
    if counter in ('731962', '22555771'):
        return 'search'
    if counter in ('23474449', '722545'):
        return 'morda'
    return '-'


def get_plat_os(plat_, parsed_os):
    """Refine the ``'touch'`` platform with the operating system.

    Non-touch platforms are returned untouched (and *parsed_os* is not
    inspected). For ``'touch'`` the ``OSFamily`` and ``OSName`` entries of
    *parsed_os* are searched case-insensitively: a mention of ``android``
    gives ``'touch_android'``, otherwise a mention of ``ios`` gives
    ``'touch_ios'``, otherwise ``'touch'`` is kept.
    """
    if plat_ != 'touch':
        return plat_

    def os_mentions(token):
        # Missing or empty fields are treated as empty strings.
        return any(
            token in (parsed_os.get(field) or '').lower()
            for field in ('OSFamily', 'OSName')
        )

    if os_mentions('android'):
        return 'touch_android'
    if os_mentions('ios'):
        return 'touch_ios'
    return plat_


def reduce_hits(groups):
    """Collapse one user's hit rows into a single record.

    For every group a record is yielded with the group's ``yandexuid`` and
    ``hits``: a dict mapping ``'<plat>/<service>'`` to the sum of the
    ``hits`` field over the group's rows.
    """
    for key, records in groups:
        hits_by_segment = Counter()
        for row in records:
            segment = '{}/{}'.format(row.plat, row.service)
            hits_by_segment[segment] += row.hits
        yield Record(yandexuid=key.yandexuid, hits=dict(hits_by_segment))


def reduce_banners(groups):
    """Collapse one user's banner rows into a single record.

    For every group a record is yielded with the group's ``yandexuid`` and
    ``shows``: a dict mapping each ``distr_obj`` to the number of rows
    carrying it.
    """
    for key, records in groups:
        shows_by_obj = Counter()
        for row in records:
            shows_by_obj[row.distr_obj] += 1
        yield Record(yandexuid=key.yandexuid, shows=dict(shows_by_obj))


class CheckUidAge(object):
    """Callable predicate: is the uid further than *threshold* from an event?

    The last 10 characters of the uid are parsed as an integer and compared
    with the event timestamp.
    NOTE(review): presumably both are unix seconds and the uid suffix is the
    moment the uid was issued -- confirm against the uid format.
    """

    def __init__(self, threshold):
        """Store *threshold* (anything ``int()`` accepts)."""
        self.threshold = int(threshold)

    def __call__(self, uid, timestamp):
        """Return False only for uids within ``threshold`` of *timestamp*.

        Best-effort: when either the timestamp or the uid suffix cannot be
        parsed as an integer the record is kept (True is returned).
        """
        try:
            event_ts = int(timestamp)
            uid_ts = int((uid or '')[-10:])
        except (TypeError, ValueError, OverflowError):
            # Only conversion problems mean "cannot tell"; anything else
            # (KeyboardInterrupt, SystemExit, real bugs) must propagate.
            return True
        return abs(event_ts - uid_ts) > self.threshold


def time_splitter(records, t1, t2, t3):
    """Fan records out into three outputs by uid age.

    Every record goes to *t1*. It additionally goes to *t2* when
    ``CheckUidAge(3600)`` accepts its ``yandexuid``/``unixtime`` pair, and
    to *t3* when ``CheckUidAge(86400)`` does.
    """
    age_gated_outputs = (
        (CheckUidAge(3600), t2),
        (CheckUidAge(86400), t3),
    )
    for rec in records:
        t1(rec)
        for is_old_enough, emit in age_gated_outputs:
            if is_old_enough(rec.yandexuid, rec.unixtime):
                emit(rec)


def process_hits(stream):
    """Turn a stream of hit rows into one record per ``yandexuid``.

    Rows are first counted per (yandexuid, plat, service); the counts are
    then folded per yandexuid by ``reduce_hits``.
    """
    hits_per_segment = stream.groupby(
        'yandexuid', 'plat', 'service'
    ).aggregate(
        hits=na.count()
    )
    return hits_per_segment.groupby('yandexuid').reduce(reduce_hits)


def process_banners(stream):
    """Turn a stream of banner rows into one record per ``yandexuid``.

    The per-user folding is done by ``reduce_banners``.
    """
    per_user = stream.groupby('yandexuid')
    return per_user.reduce(reduce_banners)


def prepare_stats(records):
    """Expand joined per-user records into per-slice statistic rows.

    For every input record (fields ``yandexuid``, ``cleanness``, ``hits``
    and ``shows`` are read):

    * a ``'_total_/_total_'`` entry is added to ``rec.hits`` and a
      ``'_total_'`` entry to ``rec.shows`` (both dicts are modified in
      place);
    * the user is assigned to the ``'raw'`` hits group plus every
      ``'<n>hits+'`` group whose threshold the total hit count reaches;
    * for every (hits key, shows key) pair, taken in sorted order, the
      shows/hits ratio rounded to 2 digits is computed; pairs whose ratio
      exceeds 1 are dropped, the rest yield one row per hits group.
    """
    thresholds = [2, 3, 5, 8, 13, 21, 34, 55]
    for rec in records:
        total_hits = sum(rec.hits.values())
        rec.hits['_total_/_total_'] = total_hits
        rec.shows['_total_'] = sum(rec.shows.values())

        hits_group = ['raw'] + [
            '{}hits+'.format(limit)
            for limit in thresholds
            if total_hits >= limit
        ]

        shows_keys = sorted(rec.shows)
        for hits_key in sorted(rec.hits):
            for distr_obj in shows_keys:
                plat, service = hits_key.split('/')
                hits = rec.hits[hits_key]
                shows = rec.shows[distr_obj]
                ratio = round(shows / float(hits), 2) if hits != 0 else 0
                # More shows than hits is not emitted for this pair.
                if ratio > 1:
                    continue
                for group in hits_group:
                    yield Record(
                        age_filter=rec.cleanness,
                        yandexuid=rec.yandexuid,
                        plat=plat,
                        distr_obj=distr_obj,
                        service=service,
                        ratio=ratio,
                        shows=shows,
                        hits=hits,
                        hits_filter=group
                    )


def make_stat(groups):
    """Aggregate the rows of each group into one report record.

    The output record carries the group key fields plus:

    * ``shows_per_user`` -- total shows divided by the number of users;
    * ``mean_shows`` -- total shows divided by total hits, capped at 1;
    * ``q10`` .. ``q90`` -- deciles of the per-row ``ratio`` values.

    Users are counted as changes of ``yandexuid`` between consecutive
    rows, so rows of one user must arrive next to each other.
    """
    for key, records in groups:
        ratio_counts = Counter()
        user_count = 0
        shows_sum = 0
        hits_sum = 0
        last_uid = ''
        for row in records:
            if row.yandexuid != last_uid:
                user_count += 1
                last_uid = row.yandexuid
            ratio_counts[row.ratio] += 1
            shows_sum += row.shows
            hits_sum += row.hits

        # NOTE(review): vars() hands back the key's own attribute dict, so
        # the statistics below are written into the key object itself.
        result = vars(key)
        result['shows_per_user'] = round(shows_sum / float(user_count), 2)
        result['mean_shows'] = min(round(shows_sum / float(hits_sum), 2), 1)
        for i in range(1, 10):
            decile = counter_quantile(ratio_counts, i * 0.1)
            result['q{}'.format(i * 10)] = round(decile, 2)
        yield Record(**result)


def b(s):
    """Return *s* encoded as UTF-8."""
    encoded = s.encode('utf8')
    return encoded


# Substrings that disqualify a distribution object name.
BANNED = [
    'footer',
    'teaser',
    'softlink',
    'bannermedia',
    'soft_link',
    'notification',
]


def check_distr_obj_(distr_obj):
    """Return True unless *distr_obj* contains one of the BANNED substrings.

    A missing (falsy) *distr_obj* is treated as an empty string and passes.
    """
    name = distr_obj or ''
    return not any(fragment in name for fragment in BANNED)


def check_distr_obj(distr_obj):
    """Return True only for the distribution objects the report covers.

    A missing (falsy) *distr_obj* is treated as an empty string and is
    rejected.
    """
    allowed = {
        'smart-banner', 'portal_popup', 'default_search', 'distr_stripe'
    }
    return (distr_obj or '') in allowed


class GetQuantile(object):
    """Empty placeholder class: it defines no attributes or methods."""


def parse_date(d):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Raises:
        ValueError: if *d* does not match the format.
    """
    parsed = datetime.datetime.strptime(d, '%Y-%m-%d')
    return parsed.date()


# Referers whose banner events are kept (compared by exact match).
GOOD_REFERERS = {
    'yandex.ru',
    'yandex.ru/search',
    'yandex.ru/touchsearch',
}


def main():
    """Parse the command line and hand control over to ``Monitoring``.

    Recognised options: ``--from``, ``--to`` (both default to None),
    ``--debug`` and ``--nolock`` (flags). ``count=True`` and
    ``publish=False`` are injected as parser defaults. ``process_date`` is
    passed to ``Monitoring`` as the ``process_table`` callback.
    """
    parser = argparse.ArgumentParser()
    parser.set_defaults(count=True, publish=False)
    parser.add_argument('--from', default=None)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--nolock', action='store_true')
    parser.add_argument('--to', default=None)
    args = parser.parse_args()

    script_path = os.path.abspath(__file__)
    logger = make_logger(script_path, debug=args.debug)

    monitoring = Monitoring(
        prefix='',
        filepath=__file__,
        default_id='2017-04-05',
        mode='dates',
        confirmation=True,
        process_table=process_date,
        logger=logger
    )
    monitoring.start(args)


def _watch_log_table(job, day):
    """Return the bs-watch-log input table of *job* for *day*.

    ``statbox/bs-watch-log/<day>`` is preferred; the
    ``statbox/bs-watch-log.6p/<day>`` table is used only when the former
    does not exist.

    Raises:
        Exception: if neither table exists on the cluster.
    """
    candidates = (
        'statbox/bs-watch-log/{}'.format(day),
        'statbox/bs-watch-log.6p/{}'.format(day),
    )
    for path in candidates:
        if cluster.driver.exists(path):
            return job.table(path, ignore_missing=True)
    raise Exception('Tables {} not found'.format(
        'statbox/bs-watch-log[.6p]/{}'.format(day)
    ))


def process_date(date, days=6, logger=None):
    """Build the distribution report for *date* and publish it to Statface.

    The window covers *date* and the *days* preceding days. Three cluster
    jobs run one after another:

    1. Banner "show" events are filtered, split by uid age with
       ``time_splitter`` and folded per user; watch-log hits of the four
       known counters are extracted into a temporary ``hits_base`` table.
    2. ``hits_base`` is split by uid age and folded per user, after which
       the temporary table is removed.
    3. Hits and banners of every age slice are inner-joined on
       ``yandexuid``, expanded by ``prepare_stats`` and aggregated by
       ``make_stat`` into ``$job_root/<date>/report``.

    Finally the report table is read back and published to Statface.

    Args:
        date: last day of the window; must support subtraction of a
            ``timedelta`` (presumably a ``datetime.date`` -- confirm
            against the ``Monitoring`` caller).
        days: number of extra days before *date* to include.
        logger: accepted for the caller's sake; not used here.

    Returns:
        True once the report has been published.

    Raises:
        Exception: if a day of the window has no watch-log table.
    """
    # (table-name suffix, value of the `cleanness` column) for the three
    # outputs of time_splitter, in the order it emits them.
    slices = (('all', 'raw'), ('hour', 'hour_clean'), ('day', 'day_clean'))

    # ---- Job 1: per-user banners and the raw hits table ------------------
    job = cluster.job()

    wl_tables = []
    banners_tables = []
    for date_ in pd.date_range(date - datetime.timedelta(days=days), date):
        day = date_.date()
        wl_tables.append(_watch_log_table(job, day))
        banners_tables.append(
            job.table(
                'home/personalization/v4_daily/{}/atom_banners'.format(day),
                ignore_missing=True
            )
        )

    wl = job.concat(*wl_tables)

    banners = job.concat(*banners_tables).filter(
        nf.and_(
            nf.equals('eventtype', 'show'),
            nf.not_(nf.equals('yesterday', 'True')),
            nf.custom(check_distr_obj, 'distr_obj'),
            nf.custom(lambda x: (x or '') in GOOD_REFERERS, 'referer')
        )
    ).project(
        'yandexuid', 'unixtime',
        distr_obj=ne.custom(process_distr_obj, 'distr_obj', 'referer')
    )

    banners_all, banners_hour, banners_day = banners.map(
        time_splitter
    )
    for (suffix, _), stream in zip(
        slices, (banners_all, banners_hour, banners_day)
    ):
        process_banners(stream).put(
            '$job_root/{}/banners_{}'.format(date, suffix)
        )

    wl.qb2(
        log='bs-watch-log',
        fields=[
            'uid', 'canonized_vhost', 'domain', 'timestamp',
            'page', 'counter_id', 'parsed_os',
            se.custom('service', get_service, 'counter_id'),
            se.custom('plat_', get_plat, 'counter_id')
        ],
        filters=[
            sf.default_filtering('bs-watch-log'),
            sf.defined('uid'),
            sf.one_of(
                'counter_id', {'731962', '22555771', '23474449', '722545'}
            )
        ]
    ).project(
        'service', unixtime='timestamp', yandexuid='uid',
        plat=ne.custom(get_plat_os, 'plat_', 'parsed_os')
    ).put(
        '$job_root/{}/hits_base'.format(date)
    )

    job.run()

    # ---- Job 2: per-user hits for every age slice ------------------------
    job = cluster.job()

    hits = job.table(
        '$job_root/{}/hits_base'.format(date)
    )
    hits_all, hits_hour, hits_day = hits.map(
        time_splitter
    )
    for (suffix, _), stream in zip(slices, (hits_all, hits_hour, hits_day)):
        process_hits(stream).put(
            '$job_root/{}/hits_{}'.format(date, suffix)
        )

    job.run()

    # The raw hits table is only an intermediate between jobs 1 and 2.
    cluster.driver.remove('{}/{}/hits_base'.format(job_root, date))

    # ---- Job 3: join, expand and aggregate into the report ---------------
    job = cluster.job()

    joined_slices = []
    for suffix, cleanness in slices:
        slice_hits = job.table('$job_root/{}/hits_{}'.format(date, suffix))
        slice_banners = job.table(
            '$job_root/{}/banners_{}'.format(date, suffix)
        )
        joined_slices.append(
            slice_hits.project(
                ne.all(), cleanness=ne.const(cleanness)
            ).join(
                slice_banners, by='yandexuid', type='inner'
            )
        )

    hits_banners_joined = job.concat(
        *joined_slices
    ).put('$job_root/{}/hits_banners_joined'.format(date))

    stats_prepared = hits_banners_joined.map(
        prepare_stats
    )

    # make_stat counts users by changes of yandexuid, hence the sort.
    stats_prepared.groupby(
        'age_filter', 'hits_filter', 'distr_obj', 'plat', 'service'
    ).sort('yandexuid').reduce(
        make_stat
    ).project(
        ne.all(), fielddate=ne.const(format(date))
    ).put(
        '$job_root/{}/report'.format(date)
    )

    job.run()

    # ---- Publish ----------------------------------------------------------
    # NOTE(review): credentials do not belong in source control. They can
    # now be supplied through the environment; the literals remain only as
    # a backward-compatible fallback and should be rotated and removed.
    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=os.environ.get('STATFACE_USERNAME', 'robot_ensuetina'),
        password=os.environ.get('STATFACE_PASSWORD', 'woh8Ahshae')
    )

    report = ns.StatfaceReport().path(
        'Yandex_RU/Others/distribution_metric/report_experimental_2'
    ).scale('daily')

    report = report.client(client)

    report = report.data(
        cluster.read('$job_root/{}/report'.format(date))
    )

    report.publish()

    return True


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
