# -*- coding: utf-8 -*-
import os

from dateutil import rrule
from passport.backend.profile import get_cluster
from passport.backend.profile.jobs.prepare_dataset_for_tensornet import prepare_balanced_dataset
from passport.backend.profile.utils.helpers import to_date_str
from passport.backend.profile.utils.parsers import (
    int_missing,
    probability,
)
from passport.backend.profile.utils.yt import get_yt
import yt.wrapper as yt


# A progress line is printed once per this many exported rows.
SHOW_PROCESS_AFTER_LINES = 10 * 1000

# Time-window suffixes appended to feature names (e.g. ``<name>_1d``).
# Used when a caller of features_timerange() does not pass its own windows.
DEFAULT_TIMERANGES = ('1d', '1w', '1m')

# Windows used for the passport "*_prob" features.
TIMERANGES_PROB_PASSPORT = (
    '1d',
    '1w',
    '1m',
    '3m',
    '6m',
)

# Windows used for the blackbox "su_*_prob" features.
DEFAULT_TIMERANGES_BLACKBOX = (
    '1d',
    '1w',
    '1m',
    '3m',
    '6m',
)


def features_with_probabilities(name, timeranges=None, type_=None, converter=None):
    """Describe a feature together with its per-window ``<name>_prob_*`` features.

    :param name: base feature name.
    :param timeranges: window suffixes, forwarded to features_timerange().
    :param type_: type of the base feature; when falsy the base feature itself
        is left out and only the probability features are returned.
    :param converter: converter for the base feature only.
    :return: list of ``(feature_name, feature_type, converter)`` tuples; the
        probability entries always have type None and the ``probability``
        converter.
    """
    prob_features = features_timerange(name + '_prob', timeranges, type_=None, converter=probability)
    if not type_:
        return prob_features
    return [(name, type_, converter)] + prob_features


def features_timerange(name, timeranges=None, type_=None, converter=None):
    """Expand one feature name into one entry per time window.

    :param name: base feature name; each entry is named ``<name>_<window>``.
    :param timeranges: iterable of window suffixes; DEFAULT_TIMERANGES is used
        when None is passed.
    :param type_: feature type stored unchanged in every entry.
    :param converter: value converter stored unchanged in every entry.
    :return: list of ``(feature_name, type_, converter)`` tuples, in the order
        of ``timeranges``.
    """
    windows = DEFAULT_TIMERANGES if timeranges is None else timeranges

    expanded = []
    for window in windows:
        expanded.append(('%s_%s' % (name, window), type_, converter))
    return expanded


def _build_feature_type_mapping():
    """Assemble the ordered list of ``(feature_name, feature_type, converter)``.

    The order of the entries matters: it determines the column order of the
    exported rows and the indices written by build_fd_files().

    Features that are intentionally left out (they were disabled in this list):
      * challenge_origin -- this factor will not be available when the model
        is used in production;
      * geo_id, browser_version, month (categorical);
      * yandexuid_ts;
      * day_part / weekday together with their probabilities.
    """
    mapping = [
        ('hour', None, int_missing()),
        ('is_weekend', None, int_missing()),
        ('is_mobile', None, int_missing()),
        ('yandexuid_ts_freshness', None, int_missing()),
    ]

    def add_counts(*names):
        # Counters over DEFAULT_TIMERANGES; a fresh converter per feature name.
        for name in names:
            mapping.extend(features_timerange(name, converter=int_missing(missing=0)))

    def add_probabilities(timeranges, *names):
        # Probability features over the given windows.
        for name in names:
            mapping.extend(features_timerange(name, timeranges, converter=probability))

    add_counts(
        'succ_auth_count',
        'captcha_passed',
    )

    add_probabilities(
        TIMERANGES_PROB_PASSPORT,
        'browser_prob',
        'browser_name_prob',
        'browser_os_prob',
        'browser_grouped_prob',
        'os_name_prob',
        'os_family_prob',
        'ip_prob',
        'city_prob',
        'country_prob',
        'retpath_host_2_prob',
        'retpath_host_3_prob',
        'referer_host_2_prob',
        'referer_host_3_prob',
        'yandexuid_prob',
        'as_list_prob',
    )

    add_counts(
        'country_count',
        'city_count',
        'browser_name_count',
        'os_name_count',
        'yandexuid_count',
        'retpath_host_3_count',
        'referer_host_3_count',
    )

    # Blackbox
    add_probabilities(
        DEFAULT_TIMERANGES_BLACKBOX,
        'su_ip_prob',
        'su_country_prob',
        'su_city_prob',
        'su_as_list_prob',
        'su_day_part_prob',
        'su_weekday_prob',
        'su_browser_prob',
        'su_browser_name_prob',
        'su_is_mobile_prob',
        'su_browser_os_prob',
    )

    return mapping


FEATURE_TYPE_MAPPING = _build_feature_type_mapping()


def prepare_row_for_tensornet(row, query_id):
    """Serialize one dataset row into a tab-separated line.

    The line starts with four service fields (query id, target, a
    ``track_id-uid-unixtime`` identifier in the url slot, and an empty host),
    followed by every feature listed in FEATURE_TYPE_MAPPING, in that order.

    :param row: mapping with the keys 'target', 'track_id', 'uid', 'unixtime'
        and one key per feature name.
    :param query_id: value written into the first field.
    :return: the line as a string, without a trailing newline.
    """
    fields = [
        query_id,
        row['target'],
        '%s-%s-%s' % (row['track_id'], row['uid'], row['unixtime']),
        '',
    ]
    for feature_name, _, convert in FEATURE_TYPE_MAPPING:
        if convert is None:
            # No converter configured: the raw value is written as-is.
            fields.append(row[feature_name])
        else:
            fields.append(convert(row[feature_name]))
    return '\t'.join(str(field) for field in fields)


def get_columns():
    """Return the header column names matching prepare_row_for_tensornet().

    :return: list with the four service columns followed by every feature
        name from FEATURE_TYPE_MAPPING, in order.
    """
    header = ['query_id', 'target', 'url', 'host']
    header.extend(entry[0] for entry in FEATURE_TYPE_MAPPING)
    return header


def build_fd_files(base_fd_filename):
    """Write the three feature-description files derived from FEATURE_TYPE_MAPPING.

    Files produced (tab-separated, one feature per line):
      * ``<base>.fd``       -- position in FEATURE_TYPE_MAPPING and feature type;
      * ``<base>.num.fd``   -- running index among numeric features, 'num', name;
      * ``<base>.categ.fd`` -- running index among categorical features, 'num', name.

    A feature type of None is treated as 'num'. All three files are opened
    (created or truncated) before any feature is inspected.

    :param base_fd_filename: path prefix to which the suffixes are appended.
    :raises ValueError: on a feature type other than None, 'num' or 'categ'.
    """
    with open(base_fd_filename + '.fd', 'w') as combined_fd, \
            open(base_fd_filename + '.num.fd', 'w') as numeric_fd, \
            open(base_fd_filename + '.categ.fd', 'w') as categorical_fd:
        numeric_seen = 0
        categorical_seen = 0
        for position, entry in enumerate(FEATURE_TYPE_MAPPING):
            feature_name, feature_type, _ = entry
            if feature_type in [None, 'num']:
                combined_fd.write('%s\t%s\n' % (position, 'num'))
                numeric_fd.write('%s\t%s\t%s\n' % (numeric_seen, 'num', feature_name))
                numeric_seen += 1
                continue
            if feature_type == 'categ':
                combined_fd.write('%s\t%s\n' % (position, 'categ'))
                # NOTE(review): the type column here is 'num', not 'categ' --
                # kept as-is; confirm whether this is intended.
                categorical_fd.write('%s\t%s\t%s\n' % (categorical_seen, 'num', feature_name))
                categorical_seen += 1
                continue
            raise ValueError('Unsupported feature type: %s' % feature_type)


def prepare_and_split_dataset_for_tensornet(
    config,
    date_start,
    date_end,
    class_balance,
    output_dir,
    output_filename,
    tmp_dir,
    dataset_dir='//home/passport/dataset',
    skip_target_count=False,
):
    """Export the table produced by prepare_balanced_dataset() to a local file.

    The work of building the table is delegated to prepare_balanced_dataset();
    its return value is used as the YT table path to read. The local file gets
    a header line from get_columns() followed by one line per table row, and
    the feature-description files are then written next to it.

    :param config: configuration passed to get_yt() to obtain the YT client.
    :param date_start: forwarded to prepare_balanced_dataset().
    :param date_end: forwarded to prepare_balanced_dataset().
    :param class_balance: forwarded to prepare_balanced_dataset().
    :param output_dir: forwarded to prepare_balanced_dataset().
    :param output_filename: local path of the tab-separated output; also the
        prefix for the files written by build_fd_files().
    :param tmp_dir: forwarded to prepare_balanced_dataset().
    :param dataset_dir: forwarded to prepare_balanced_dataset() as input_dir.
    :param skip_target_count: forwarded to prepare_balanced_dataset() as
        skip_count.
    """
    yt_client = get_yt(config)

    balanced_table = prepare_balanced_dataset(
        date_start=date_start,
        date_end=date_end,
        class_balance=class_balance,
        input_dir=dataset_dir,
        output_dir=output_dir,
        tmp_dir=tmp_dir,
        yt_client=yt_client,
        skip_count=skip_target_count,
    )

    header_line = '\t'.join(get_columns()) + '\n'
    with open(output_filename, 'w') as output_file:
        output_file.write(header_line)
        table_rows = yt.read_table(balanced_table, format='yson', raw=False, client=yt_client)
        for row_number, row in enumerate(table_rows):
            # The row number doubles as the query id of the exported line.
            output_file.write(prepare_row_for_tensornet(row, row_number) + '\n')
            if row_number % SHOW_PROCESS_AFTER_LINES == 0:
                print('processed %s lines' % row_number)  # noqa

    build_fd_files(output_filename)


def prepare_dataset_for_tensornet(
    config,
    date_start,
    date_end,
    limit,
    output_filename,
    dataset_dir='//home/passport/dataset',
    from_tmp=False,
):
    """Export a random sample of the daily dataset tables to a local file.

    Unless ``from_tmp`` is set, the per-day tables under ``dataset_dir`` for
    the given date range are counted, and a random sample with fraction
    ``limit / total_rows`` is put into a table under ``//tmp/passport-dataset``.
    That table is then dumped to ``output_filename`` (header line plus one
    line per row) and the feature-description files are written next to it.

    :param config: configuration passed to get_yt() and get_cluster().
    :param date_start: first day of the range (inclusive in the rrule).
    :param date_end: last day of the range (used as the rrule ``until``).
    :param limit: desired sample size; converted to a sampling fraction.
    :param output_filename: local path of the tab-separated output; also the
        prefix for the files written by build_fd_files().
    :param dataset_dir: YT directory holding one table per day.
    :param from_tmp: when true, skip sampling and read the table that already
        exists at the temporary path for this date range.
    """
    date_start_str = to_date_str(date_start)
    date_end_str = to_date_str(date_end)
    sampled_table = '//tmp/passport-dataset/dataset-%s--%s' % (date_start_str, date_end_str)

    yt_client = get_yt(config)

    if not from_tmp:
        total_rows = sum(
            yt.row_count(os.path.join(dataset_dir, to_date_str(day)), client=yt_client)
            for day in rrule.rrule(rrule.DAILY, dtstart=date_start, until=date_end)
        )

        job = get_cluster(config).job()
        source_tables = job.table(os.path.join(dataset_dir, '{%s..%s}' % (date_start_str, date_end_str)))
        # NOTE(review): raises ZeroDivisionError when the tables are empty --
        # confirm whether callers rely on that.
        sample_fraction = float(limit) / total_rows
        source_tables.random(fraction=sample_fraction).put(sampled_table)
        job.run()

    header_line = '\t'.join(get_columns()) + '\n'
    with open(output_filename, 'w+') as output_file:
        output_file.write(header_line)
        table_rows = yt.read_table(sampled_table, format='yson', raw=False, client=yt_client)
        for row_number, row in enumerate(table_rows):
            # The row number doubles as the query id of the exported line.
            output_file.write(prepare_row_for_tensornet(row, row_number) + '\n')
            if row_number % SHOW_PROCESS_AFTER_LINES == 0:
                print('processed %s lines' % row_number)  # noqa

    build_fd_files(output_filename)
