import logging
import os
from collections import defaultdict
from datetime import datetime as datetime_datetime
from functools import partial
from itertools import groupby
from urlparse import urlparse

import luigi
import yt.wrapper as yt

from data_imports.import_logs import graph_import_fp as fp
from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from utils import utils
from v2.soup import soup_config
from v2.soup.soup_tables import SoupDailyLogTable


def group_ts_by_date(tss):
    """Bucket timestamps by the date string ``utils.ts_to_date_str`` assigns them.

    :param tss: iterable of timestamps; each item must be accepted by ``int()``.
    :return: iterator over ``(date, [ts, ...])`` pairs. The original (unconverted)
        items are stored, in input order within each bucket.
    """
    buckets = defaultdict(list)
    for raw_ts in tss:
        # Only the lookup key is converted; the stored value stays as passed in.
        buckets[utils.ts_to_date_str(int(raw_ts))].append(raw_ts)
    return buckets.iteritems()


def is_bad_value(id_value, id_type):
    """Tell whether a phone or date value captured by webvisor looks bogus.

    * Phones are bad when any character repeats more than 4 times in a row.
    * Dates are expected as ``day.month.year``. They are bad when they cannot
      be parsed, do not form a real calendar date, or the year lies outside
      ``[config.WEBVISOR_MINYEAR, config.WEBVISOR_MAXYEAR]``.
    * Empty values and every other id type are never reported as bad.

    :param id_value: raw value string (may be empty or ``None``).
    :param id_type: one of the ``config.ID_TYPE_*`` constants.
    :return: ``True`` if the value should be treated as bad, else ``False``.
    """
    if not id_value:
        return False

    if id_type == config.ID_TYPE_PHONE:
        longest_run = max(sum(1 for _ in run) for _, run in groupby(id_value))
        return longest_run > 4

    if id_type == config.ID_TYPE_DATE:
        try:
            # Parsing is inside the try on purpose: non-numeric parts or a wrong
            # number of parts raise ValueError and must mark the value as bad
            # rather than crash the caller.
            day, month, year = map(int, id_value.split('.'))
            # Constructed only for validation (e.g. rejects 31.02.2000).
            datetime_datetime(year, month, day)
        except ValueError:
            return True
        return not config.WEBVISOR_MINYEAR <= year <= config.WEBVISOR_MAXYEAR

    return False


def map_webvisor(rec, date):
    """Mapper: turn one raw webvisor row into email / phone / date records.

    The row's ``key`` is used as the yuid, ``subkey`` as the timestamp and
    ``value`` as the payload that ``mr.get_field_value`` extracts fields from.
    Rows whose ``subkey`` is not an integer produce nothing.

    Emails and phones are passed through ``map_login`` and tagged with
    ``@table_index`` 0; dates are passed through ``map_date`` and tagged with
    ``@table_index`` 1. Emails ending in ``yandex-team.ru`` are skipped.

    :param rec: input row with ``key``, ``subkey`` and ``value``.
    :param date: date string stored into every produced record.
    """
    yuid = rec['key']
    payload = rec['value']
    url = mr.get_field_value('url', payload)
    try:
        ts = int(rec['subkey'])
    except ValueError:
        return

    def lookup(field):
        # Shorthand for reading one field out of this row's payload.
        return mr.get_field_value(field, payload)

    def build(id_value, id_type, field_name, field_id, field_tag):
        # Base record from the shared factory, enriched with webvisor-specific
        # columns and then routed by id type.
        out_rec = fp.mkrec(yuid, id_value, id_type, config.ID_SOURCE_TYPE_WEBVISOR, date, 1)
        out_rec['url'] = url
        out_rec['ts'] = ts
        out_rec['field_name'] = field_name
        out_rec['field_id'] = field_id
        out_rec['field_tag'] = field_tag
        out_rec['bad_value'] = is_bad_value(id_value, id_type)
        if id_type in (config.ID_TYPE_EMAIL, config.ID_TYPE_PHONE):
            out_rec['@table_index'] = 0
            return map_login(out_rec)
        if id_type == config.ID_TYPE_DATE:
            out_rec['@table_index'] = 1
            return map_date(out_rec)
        return None

    email = lookup(config.ID_TYPE_EMAIL)
    phone = lookup(config.ID_TYPE_PHONE)
    birth_date = lookup(config.ID_TYPE_DATE)

    if email and not email.endswith('yandex-team.ru'):
        yield build(email, config.ID_TYPE_EMAIL,
                    lookup('email_name'), lookup('email_id'), lookup('email_tag'))

    if phone:
        yield build(phone, config.ID_TYPE_PHONE,
                    lookup('phone_name'), lookup('phone_id'), lookup('phone_tag'))

    if birth_date:
        yield build(birth_date, config.ID_TYPE_DATE,
                    lookup('date_name'), lookup('date_id'), lookup('date_tag'))


def process_fresh(in_fresh_f, webvisor_processed_f, webvisor_date_processed_f):
    """Map fresh webvisor tables into per-date login and date tables.

    Tables found in ``in_fresh_f`` are grouped by date, mapped with
    ``map_webvisor`` in batches, and appended to ``<processed folder>/<date>``.
    Afterwards all touched output tables are sorted and the consumed input
    tables are dropped.

    :param in_fresh_f: folder holding the fresh input tables.
    :param webvisor_processed_f: folder for per-date login (email/phone) tables.
    :param webvisor_date_processed_f: folder for per-date date tables.
    """
    # At most 30 tables go into a single map operation, for performance reasons.
    batch_size = 30

    # The newest table is left out: per the original author's note it is locked
    # by stbx-parser and any operation on it would make the process fail.
    pending = sorted(mr.ls(in_fresh_f, absolute_path=False))[:-1]

    touched_outputs = []
    for date, day_tables in group_ts_by_date(pending):
        login_table = os.path.join(webvisor_processed_f, date)
        date_table = os.path.join(webvisor_date_processed_f, date)
        mapper = partial(map_webvisor, date=date)

        for start in range(0, len(day_tables), batch_size):
            batch = day_tables[start:start + batch_size]
            sources = [os.path.join(in_fresh_f, name) for name in batch]
            # Output order matters: index 0 is the login table, index 1 the date table.
            destinations = [
                yt.TablePath(login_table, append=True),
                yt.TablePath(date_table, append=True),
            ]
            yt.run_map(mapper, sources, destinations)

        touched_outputs.extend([login_table, date_table])

    mr.sort_all(touched_outputs, sort_by=config.ID_TYPE_YUID)

    for name in pending:
        mr.drop(os.path.join(in_fresh_f, name))


def map_date(rec):
    """Annotate a webvisor date record with its URL domain and a birth-date verdict.

    Adds three keys to ``rec`` and returns it:

    * ``domain`` - hostname of ``rec['url']``;
    * ``field_date_tokens`` - which of the birth-date markers occur in the
      lower-cased ``field_id`` / ``field_name`` attributes;
    * ``is_birth_date`` - ``True`` when at least one marker was found and
      ``rec['bad_value']`` is falsy.

    :param rec: record with ``url`` and ``bad_value``; ``field_id`` and
        ``field_name`` are optional.
    :return: the same ``rec`` object, mutated in place.
    """
    birth_markers = ('birth', 'born', 'bday')

    domain = urlparse(rec['url']).hostname

    # Missing or empty attributes are skipped; matching is case-insensitive.
    raw_attrs = (rec.get('field_id'), rec.get('field_name'))
    attrs = [value.lower() for value in raw_attrs if value]

    matched = set(
        marker
        for attr in attrs
        for marker in birth_markers
        if marker in attr
    )

    rec['domain'] = domain
    rec['is_birth_date'] = bool(matched and not rec['bad_value'])
    rec['field_date_tokens'] = list(matched)
    return rec


def map_login(rec):
    """Annotate a webvisor email/phone record with login-page heuristics.

    Adds four keys to ``rec`` and returns it:

    * ``domain`` - hostname of ``rec['url']``;
    * ``url_login_tokens`` - login-related tokens found in the URL path + query;
    * ``field_login_tokens`` - login-related tokens found in the ``field_id`` /
      ``field_name`` attributes;
    * ``is_login`` - ``True`` when any token was found in either place.

    All matching is case-insensitive.

    :param rec: record with ``url``; ``field_id`` and ``field_name`` are optional.
    :return: the same ``rec`` object, mutated in place.
    """
    url_login_tokens = (
        'login', 'logon', 'account', 'user',
        'auth', 'sign', 'ident', 'regist',
        'voyti', 'vojti',
        'restore', 'recovery', 'forgot', 'reset', 'password',
    )
    # Field attributes are short, so abbreviated forms are accepted there too.
    field_login_tokens = url_login_tokens + ('reg', 'log', 'lgn')

    url_parsed = urlparse(rec['url'])
    # Lower-cased so that URLs such as /Account/Login match the lower-case
    # tokens, consistent with how the field attributes are handled below.
    url_part = (url_parsed.path + url_parsed.query).lower()

    found_url_tokens = set(token for token in url_login_tokens if token in url_part)

    # Missing or empty attributes are skipped.
    raw_attrs = (rec.get('field_id'), rec.get('field_name'))
    field_attrs = [value.lower() for value in raw_attrs if value]

    found_field_tokens = set(
        token
        for attr in field_attrs
        for token in field_login_tokens
        if token in attr
    )

    rec['domain'] = url_parsed.hostname
    rec['is_login'] = bool(found_url_tokens or found_field_tokens)
    rec['url_login_tokens'] = list(found_url_tokens)
    rec['field_login_tokens'] = list(found_field_tokens)

    return rec


class ImportWebvisorFromStbxTask(yt_luigi.BaseYtTask):
    """Pull fresh stbx webvisor tables into a private copy and process them.

    ``run`` copies every table from the fresh stbx folder into the
    ``webvisor_copy`` folder (removing the source table), runs
    ``process_fresh`` on the copies and finally creates a per-date "done"
    flag table next to the processed-login folder.

    NOTE(review): ``in_f`` / ``out_f`` come from the base class; presumably they
    resolve the names declared in ``input_folders`` / ``output_folders`` -
    confirm against ``yt_luigi.BaseYtTask``.
    """
    date = luigi.Parameter()

    def requires(self):
        """This task has no upstream luigi dependencies."""
        return []

    def input_folders(self):
        """Named input folders (trailing slash stripped)."""
        return {
            'webvisor': config.FRESH_STBX_WEBVISOR_FOLDER.rstrip('/')
        }

    def output_folders(self):
        """Named output folders (trailing slashes stripped).

        The done flag is a sibling of the processed-login folder named
        ``_<date>_webvisor_import_done``.
        """
        return {
            'webvisor_copy': config.FRESH_WEBVISOR_FOLDER.rstrip('/'),
            'webvisor_processed': config.WEBVISOR_LOGIN_FOLDER_PROCESSED.rstrip('/'),
            'webvisor_done_flag': "/".join(
                config.WEBVISOR_LOGIN_FOLDER_PROCESSED.rstrip('/').split('/')[:-1]
                + ['_{}_webvisor_import_done'.format(self.date)]
            ),
            'webvisor_date_processed': config.WEBVISOR_DATE_FOLDER_PROCESSED.rstrip('/')
        }

    def run(self):
        """Copy fresh tables aside, process them and set the done flag."""
        # fresh tables
        fresh_tables = mr.ls(config.FRESH_STBX_WEBVISOR_FOLDER.rstrip('/'), absolute_path=False)
        # Arguments are handed to the logger instead of pre-formatting with %,
        # so formatting is deferred and cannot break on a tuple argument.
        logging.info('Found fresh tables: %s', fresh_tables)
        # Drop the flag first: from now on one trace of successful execution is
        # missing, so an interrupted run is not considered complete.
        mr.drop(self.out_f('webvisor_done_flag'))

        mr.mkdir(self.out_f('webvisor_copy'))
        mr.mkdir(self.out_f('webvisor_processed'))
        mr.mkdir(self.out_f('webvisor_date_processed'))

        # Make shadow copy to webvisor_copy
        for t in sorted(fresh_tables):
            in_table = '/'.join([self.in_f('webvisor'), t])
            copy_table = '/'.join([self.out_f('webvisor_copy'), t])

            yt.copy(in_table, copy_table, force=True)
            yt.remove(in_table, force=True)

        # Main processing
        process_fresh(
            self.out_f('webvisor_copy'),
            self.out_f('webvisor_processed'),
            self.out_f('webvisor_date_processed')
        )
        # create the last required trace of successful execution
        yt.create_table(self.out_f('webvisor_done_flag'), ignore_existing=True)

    def output(self):
        """Yield the targets for this task: three folders and the done flag."""
        for table in [
            yt_luigi.YtFolderTarget(self.out_f('webvisor_processed'), allow_empty=False),
            yt_luigi.YtFolderTarget(self.out_f('webvisor_date_processed'), allow_empty=False),
            yt_luigi.YtFolderTarget(self.out_f('webvisor_copy'), allow_empty=False),
            yt_luigi.YtTarget(self.out_f('webvisor_done_flag'), allow_empty=True)
        ]:
            yield table


def filter_good_webvisor(rec):
    """Mapper: pass through only trustworthy login records.

    A record is dropped when its ``bad_value`` is true (per ``utils.is_true``),
    when ``is_login`` is false (per ``utils.is_false``), or when it has a
    non-empty ``field_tag`` other than ``INPUT`` (case-insensitive).
    Everything else is yielded unchanged.
    """
    if utils.is_true(rec.get('bad_value')):
        return
    if utils.is_false(rec['is_login']):
        return
    tag = rec.get('field_tag')
    # A missing or empty tag is accepted; only an explicit non-INPUT tag rejects.
    if tag and tag.upper() != 'INPUT':
        return
    yield rec


def split_webvisor(rec):
    """Mapper: route a record to the email or phone table and emit a soup row.

    For every input record two rows are yielded, in this order:

    1. a ``SoupDailyLogTable`` record (``table_index=2``) linking the yuid to
       the normalised email, or to the phone value without its first character;
    2. the input record itself, tagged with ``@table_index`` 0 for emails and
       1 for every other id type.
    """
    if rec['id_type'] == config.ID_TYPE_EMAIL:
        rec['@table_index'] = 0
        yuid = rec['yuid']
        soup_value = utils.norm_email(rec['id_value'])
        edge_type = soup_config.yuid_email_webvisor
    else:
        rec['@table_index'] = 1
        yuid = rec['yuid']
        # NOTE(review): the first character is dropped - presumably a leading
        # '+' of the phone number; confirm against the data.
        soup_value = rec['id_value'][1:]
        edge_type = soup_config.yuid_phone_webvisor

    yield SoupDailyLogTable.make_rec(yuid, soup_value, edge_type, table_index=2)
    yield rec


class ImportWebvisorTask(yt_luigi.BaseYtTask):
    """Build per-date yuid<->email / yuid<->phone tables from processed webvisor logs.

    Reads the processed login table for ``date``, keeps only good login
    records, reduces them by yuid, then splits the result into an email table,
    a phone table and the webvisor soup log.
    """
    date = luigi.Parameter()
    run_date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(ImportWebvisorTask, self).__init__(*args, **kwargs)
        self.soup_log = SoupDailyLogTable(soup_config.LOG_SOURCE_WEBVISOR, self.date)

    def requires(self):
        """The stbx import for ``run_date`` must have completed first."""
        return [ImportWebvisorFromStbxTask(self.run_date)]

    def _yuid_raw_table(self, *name_parts):
        """Return ``<output>/<date>/yuid_raw/yuid_with_<parts joined by '_'>``."""
        return (config.YT_OUTPUT_FOLDER + self.date + '/'
                + 'yuid_raw/yuid_with_' + '_'.join(name_parts))

    def run(self):
        """Filter, reduce, split and sort the webvisor records for ``self.date``."""
        self.soup_log.ensure_dir()

        # rstrip + explicit '/' keeps the path valid whether or not the
        # configured folder ends with a slash, matching how the processed
        # tables are written (folder with trailing slash stripped, joined
        # with the date).
        processed_table = config.WEBVISOR_LOGIN_FOLDER_PROCESSED.rstrip('/') + '/' + self.date

        all_ids_table = self._yuid_raw_table(config.ID_SOURCE_TYPE_WEBVISOR)
        email_table = self._yuid_raw_table(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_WEBVISOR)
        phone_table = self._yuid_raw_table(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_WEBVISOR)

        yt.run_map_reduce(
            filter_good_webvisor,
            fp.reduce_yuid_ids,
            processed_table,
            all_ids_table,
            reduce_by='yuid'
        )
        # Output order must match the table indices set by split_webvisor:
        # 0 - email, 1 - phone, 2 - soup log.
        yt.run_map(
            split_webvisor,
            all_ids_table,
            [email_table, phone_table, self.soup_log.create()]
        )
        self.soup_log.prepare_daily_tables_from_log()

        # Both sorts are started asynchronously and awaited together.
        utils.wait_all([
            yt.run_sort(table, sort_by='yuid', sync=False)
            for table in (email_table, phone_table)
        ])

    def output(self):
        """Email and phone tables, plus soup daily tables when ``date == run_date``."""
        if self.date == self.run_date:
            soup_out_tables = self.soup_log.daily_tables_targets()
        else:
            soup_out_tables = []

        return [
            yt_luigi.YtTarget(
                self._yuid_raw_table(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_WEBVISOR)),
            yt_luigi.YtTarget(
                self._yuid_raw_table(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_WEBVISOR)),
        ] + soup_out_tables

