# -*- coding: utf-8 -*-
import datetime
import os

from nile.api.v1 import Record
from passport.backend.profile import (
    extractors as pe,
    get_cluster,
)
from passport.backend.profile.utils.helpers import (
    browser_grouper,
    cut_host,
    probability,
    probability_grouped,
    to_date_str,
    truncate_timestamp,
)
from passport.backend.profile.utils.parsers import parse_yandexuid_timestamp
from passport.backend.profile.utils.yt import get_yt
from qb2.api.v1 import (
    extractors as se,
    filters as sf,
)
from retrying import retry
import yt.wrapper as yt
from yt.wrapper.errors import YtIncorrectResponse


class CaptchaOrigin(object):
    """Integer codes stored in the ``challenge_origin`` field of auth records.

    The codes tell which mechanism is considered the source of the
    challenge for a given auth track.
    """

    # BRUTEFORCE: used when no microprofile challenge was seen on the track.
    # MICROPROFILE: used when the challenge is attributed to the microprofile.
    BRUTEFORCE, MICROPROFILE = 1, 2


def create_auth_record(track_id, target, challenge_origin, record, submitted_record):
    """Assemble the output ``Record`` describing one authorization attempt.

    :param track_id: passed as the single positional argument to ``Record``
        (in ``track_reducer`` this is the group key — presumably a key
        record carrying ``track_id``; verify against the nile API).
    :param target: label stored in the ``target`` field.
    :param challenge_origin: value stored in the ``challenge_origin`` field.
    :param record: event the user/geo/time/browser fields are taken from.
    :param submitted_record: event the retpath/referer host fields are
        taken from.
    :return: a new ``Record``.
    :raises KeyError: presumably, if ``record`` lacks one of the mandatory
        time fields (they are read with ``[]`` rather than ``.get``).
    """
    # (output field, source field) pairs read leniently from `record`
    lenient_from_record = (
        ('uid', 'uid'),
        ('yandexuid', 'yandexuid'),
        ('ip', 'ip'),
        ('geo_id', 'geo_id'),
        ('is_black_ip', 'is_black_ip'),
        ('city', 'city_id'),
        ('country', 'country_id'),
        ('as_list', 'as_list'),
        ('browser_name', 'browser_name'),
        ('browser_version', 'browser_version'),
        ('os_family', 'os_family'),
        ('os_name', 'os_name'),
        ('os_version', 'os_version'),
        ('is_mobile', 'is_mobile'),
        ('is_password_change_required', 'is_password_change_required'),
    )
    host_fields = ('retpath_host_2', 'retpath_host_3', 'referer_host_2', 'referer_host_3')
    # these must be present on `record`
    strict_time_fields = ('unixtime', 'day_part', 'weekday', 'is_weekend', 'hour')

    values = {
        'target': target,
        'challenge_origin': challenge_origin,
    }
    for out_name, src_name in lenient_from_record:
        values[out_name] = record.get(src_name)
    for name in host_fields:
        values[name] = submitted_record.get(name)
    for name in strict_time_fields:
        values[name] = record[name]

    return Record(track_id, **values)


def track_reducer(groups):
    """Reduce the events of each track into at most one labelled auth record.

    For every ``(track_id, records)`` group the first event of each relevant
    action is remembered.  Tracks without an ``action=submitted`` event are
    skipped.  A track with a ``cookie_set`` event yields a record with
    ``target=0``; otherwise a track with a ``profile_threshold_exceeded`` or
    ``failed_auth`` event yields a record with ``target=1``.  Tracks with
    none of these yield nothing.

    :param groups: iterable of ``(track_id, records)`` pairs.
    :return: generator of records built by ``create_auth_record``.
    """
    for track_id, records in groups:
        # first event seen for each action value
        first_by_action = {}
        for event in records:
            first_by_action.setdefault(event['action'], event)

        submitted = first_by_action.get('submitted')
        if not submitted:
            # no action=submitted event means this track is not an authorization
            continue

        succeeded = first_by_action.get('cookie_set')
        by_microprofile = first_by_action.get('profile_threshold_exceeded')
        by_bruteforce = first_by_action.get('failed_auth')

        if succeeded:
            # "good" authorization
            label = 0
            source = succeeded
            if succeeded.get('is_auth_challenge_shown'):
                origin = CaptchaOrigin.MICROPROFILE
            else:
                origin = CaptchaOrigin.BRUTEFORCE
        elif by_microprofile or by_bruteforce:
            # "bad" authorization; the microprofile event takes precedence
            label = 1
            source = by_microprofile or by_bruteforce
            if by_microprofile:
                origin = CaptchaOrigin.MICROPROFILE
            else:
                origin = CaptchaOrigin.BRUTEFORCE
        else:
            continue

        yield create_auth_record(
            track_id=track_id,
            target=label,
            challenge_origin=origin,
            record=source,
            submitted_record=submitted,
        )


def auths_reducer(groups):
    """Collapse each group of auth records to a single record.

    The first record with ``target == 0`` (a successful auth) wins; when the
    group has none, the first record of the group is emitted instead.  An
    empty group emits ``None``.

    :param groups: iterable of ``(key, records)`` pairs.
    :return: generator yielding one item per group.
    """
    for _key, records in groups:
        chosen = None
        for candidate in records:
            if candidate['target'] == 0:
                # a successful auth overrides any failed one seen so far
                chosen = candidate
                break
            if chosen is None:
                # remember only the first failed auth
                chosen = candidate
        yield chosen


def passport_probabilities_by_time(suffix):
    """Build the qb2 extractors for the passport profile of one time window.

    The result contains, in order: the two plain counters, the hidden
    ``<key>_freq_<suffix>`` log fields, the ``<key>_count_<suffix>`` fields
    (``len`` of the frequency field) and the ``<key>_prob_<suffix>`` fields
    computed by ``probability`` from the record value and its frequency field.

    :param suffix: time-window suffix substituted into the field names
        (callers in this file pass '1d', '1w', '1m', '3m', '6m').
    :return: list of qb2 extractors.
    """
    profile_keys = (
        'ip', 'country', 'city', 'as_list', 'day_part', 'weekday',
        'os_family', 'os_name', 'browser', 'browser_name', 'browser_os',
        'is_mobile', 'yandexuid',
        'retpath_host_2', 'retpath_host_3',
        'referer_host_2', 'referer_host_3',
    )
    counted_keys = (
        'country', 'city', 'browser_name', 'os_name', 'yandexuid',
        'retpath_host_3', 'referer_host_3',
    )

    def freq_field(key):
        return key + '_freq_%s' % suffix

    extractors = [
        se.log_field('succ_auth_count_%s' % suffix),
        se.log_field('captcha_passed_%s' % suffix),
    ]

    # raw frequency fields are only inputs for the derived fields below
    extractors.extend(se.log_field(freq_field(key)).hide() for key in profile_keys)

    # as it turned out, these do not give the expected effect
    extractors.extend(
        se.custom(key + '_count_%s' % suffix, len, freq_field(key))
        for key in counted_keys
    )

    for key in profile_keys:
        extractors.append(
            se.custom(key + '_prob_%s' % suffix, probability, key, freq_field(key)),
        )
        if key == 'browser':
            # extra probability over browsers grouped by `browser_grouper`
            extractors.append(
                se.custom(
                    'browser_grouped_prob_%s' % suffix,
                    probability_grouped(browser_grouper),
                    'browser',
                    freq_field('browser'),
                ),
            )

    return extractors


def blackbox_probabilities_by_time(suffix):
    """Build the qb2 extractors for the ``su_``-prefixed profile of one window.

    The result contains the hidden ``su_<key>_freq_<suffix>`` log fields
    followed by the ``su_<key>_prob_<suffix>`` fields computed by
    ``probability`` from the record value and its frequency field.

    :param suffix: time-window suffix substituted into the field names
        (callers in this file pass '1d', '1w', '1m', '3m', '6m').
    :return: list of qb2 extractors.
    """
    freq_keys = (
        'ip', 'country', 'city', 'as_list', 'day_part', 'weekday',
        'os_family', 'os_name', 'browser', 'browser_name', 'browser_os',
        'is_mobile', 'referer_host_2', 'referer_host_3',
    )
    # no probabilities are derived for the referer hosts; the order differs
    # from `freq_keys` in the last two entries
    prob_keys = (
        'ip', 'country', 'city', 'as_list', 'day_part', 'weekday',
        'os_family', 'os_name', 'browser', 'browser_name',
        'is_mobile', 'browser_os',
    )

    hidden_frequencies = [
        se.log_field('su_' + key + '_freq_%s' % suffix).hide()
        for key in freq_keys
    ]
    # probabilities
    probabilities = [
        se.custom(
            'su_' + key + '_prob_%s' % suffix,
            probability,
            key,
            'su_' + key + '_freq_%s' % suffix,
        )
        for key in prob_keys
    ]
    return hidden_frequencies + probabilities


@retry(
    stop_max_attempt_number=3,
    wait_fixed=5000,
    # `retrying` calls this argument as a predicate on the raised exception;
    # a bare tuple of exception types is not callable on releases that do
    # not special-case tuples, so an explicit predicate is passed instead.
    retry_on_exception=lambda exc: isinstance(exc, YtIncorrectResponse),
)
def prepare_auth_dataset(config, date):
    """Build the per-day batch of the auth training dataset.

    The work is done by two jobs:

    1. Events are read from the passport log table for ``date``, reduced per
       ``track_id`` into labelled auth records (``track_reducer``), joined by
       ``uid`` with the profile table of the previous day and stored in a
       temporary table.
    2. The temporary table is read back, profile-based features are
       computed, rows without a computed profile are dropped, and the rows
       are reduced by ``auths_reducer`` per
       (uid, ip, browser_name, browser_version, unixtime_20min).  The result
       is written to ``<train_dataset_batches_dir>/<date>``.

    Finally the temporary table is removed.  The whole function is retried
    (up to 3 attempts, ``wait_fixed=5000``) when ``YtIncorrectResponse`` is
    raised.

    :param config: configuration mapping; the keys read here are
        ``config['yt']['tmp_dir']``, ``['profile_dir']``,
        ``['passport_log_dir']`` and ``['train_dataset_batches_dir']``.
    :param date: day to process; must support subtracting a
        ``datetime.timedelta``.
    """
    tmp_joined_table = os.path.join(
        config['yt']['tmp_dir'],
        'joined-auth-userprofiles-%s' % to_date_str(date),
    )

    # the profile of the previous day is joined to the events of `date`
    profile_date = date - datetime.timedelta(days=1)
    profile_table_name = os.path.join(config['yt']['profile_dir'], to_date_str(profile_date))

    cluster = get_cluster(config).env(
        templates=dict(
            date=to_date_str(date),
        ),
    )

    job_dataset = cluster.job()
    passport_log = job_dataset.table(os.path.join(config['yt']['passport_log_dir'], '$date'))

    fields = [
        'track_id',
        se.log_fields(
            'action', 'type', 'yandexuid', 'user_agent', 'ip',
            'retpath', 'referer',
            'bruteforce', 'login_status', 'password_status',
        ),
        se.log_field('retpath').hide(),
        se.log_field('referer').hide(),
        se.integer_log_fields(
            'unixtime', 'uid', 'is_auth_challenge_shown',
            'is_password_change_required', 'captcha_passed',
        ),
        pe.canonized_host('retpath_host', 'retpath').hide(),
        pe.canonized_host('referer_host', 'referer').hide(),

        se.custom('retpath_host_2', lambda host: cut_host(host, 2), 'retpath_host'),
        se.custom('retpath_host_3', lambda host: cut_host(host, 3), 'retpath_host'),

        se.custom('referer_host_2', lambda host: cut_host(host, 2), 'referer_host'),
        se.custom('referer_host_3', lambda host: cut_host(host, 3), 'referer_host'),
    ]
    fields.extend(pe.date_and_time())
    fields.extend(pe.useragent())
    fields.extend(pe.geo())

    # select password check events
    passport_log.qb2(
        log='passport-log',
        fields=fields,
        filters=[
            sf.or_(
                # auth form submission event
                sf.and_(
                    sf.equals('mode', 'any_auth'),
                    sf.equals('action', 'submitted'),
                    sf.equals('type', 'password'),
                ),
                # successful authorizations after passing a challenge
                sf.and_(
                    sf.equals('mode', 'any_auth'),
                    sf.equals('action', 'cookie_set'),
                    sf.equals('captcha_passed', 1),
                ),
                # captcha shown for a valid login/password pair
                sf.and_(
                    sf.equals('mode', 'any_auth'),
                    sf.equals('action', 'failed_auth'),
                    sf.equals('bruteforce', 'captcha'),
                    sf.equals('login_status', 'VALID'),
                    sf.equals('password_status', 'VALID'),
                ),
                # challenge shown based on microprofile data
                sf.and_(
                    sf.equals('mode', 'any_auth'),
                    sf.equals('action', 'profile_threshold_exceeded'),
                ),
            ),
        ],
    ).groupby('track_id').reduce(track_reducer).join(
        table=job_dataset.table(profile_table_name),
        by='uid',
        assume_unique_right=True,
    ).put(tmp_joined_table)

    job_dataset.run()

    job_features = cluster.job()
    auth_userprofile = job_features.table(tmp_joined_table)
    auth_userprofile.qb2(
        log='passport-log',
        fields=[
            se.log_fields(
                'track_id',
                'target',
                'challenge_origin',

                'uid',
                'yandexuid',

                'ip',
                'geo_id',
                'is_black_ip',
                'city',
                'country',
                'as_list',

                'retpath_host_2',
                'retpath_host_3',
                'referer_host_2',
                'referer_host_3',

                'unixtime',
                'day_part',
                'weekday',
                'is_weekend',
                'hour',

                'browser_name',
                'browser_version',
                'os_family',
                'os_name',
                'os_version',
                'is_mobile',

                'is_password_change_required',
                'bruteforce',
                'login_status',
                'password_status',
            ),
            se.custom('browser', lambda browser_name, browser_version: str(browser_name) + " " + str(browser_version), 'browser_name', 'browser_version'),
            se.custom('browser_os', lambda browser, os_name: str(browser) + " - " + str(os_name), 'browser', 'os_name'),
            se.custom('yandexuid_ts', parse_yandexuid_timestamp, 'yandexuid'),
            # -1 marks a yandexuid whose timestamp could not be parsed
            se.custom('yandexuid_ts_freshness', lambda unixtime, yandexuid_ts: unixtime - yandexuid_ts if yandexuid_ts is not None else -1, 'unixtime', 'yandexuid_ts'),
            se.custom('is_yandexuid_ts_future', lambda unixtime, yandexuid_ts: int(yandexuid_ts > unixtime) if yandexuid_ts is not None else -1, 'unixtime', 'yandexuid_ts'),

            # 20-minute bucket used below as part of the grouping key
            se.custom('unixtime_20min', lambda unixtime: truncate_timestamp(unixtime, 20 * 60), 'unixtime'),
        ]
        + passport_probabilities_by_time('1d')
        + passport_probabilities_by_time('1w')
        + passport_probabilities_by_time('1m')
        + passport_probabilities_by_time('3m')
        + passport_probabilities_by_time('6m')
        + blackbox_probabilities_by_time('1d')
        + blackbox_probabilities_by_time('1w')
        + blackbox_probabilities_by_time('1m')
        + blackbox_probabilities_by_time('3m')
        + blackbox_probabilities_by_time('6m'),
        filters=[
            # keep only the events whose users had a computed profile
            sf.or_(
                sf.not_(sf.equals('ip_prob_3m', None)),
                sf.not_(sf.equals('su_ip_prob_3m', None)),
            ),
        ],
        intensity='cpu',
    ).groupby(
        'uid', 'ip', 'browser_name', 'browser_version', 'unixtime_20min',
    ).reduce(
        auths_reducer,
    ).put(
        os.path.join(config['yt']['train_dataset_batches_dir'], to_date_str(date)),
    )
    job_features.run()

    # remove the temporary table that was used for the join
    yt_client = get_yt(config=config)
    yt.remove(tmp_joined_table, client=yt_client)
