import re
import urlparse
from functools import partial

import luigi
import yt.wrapper as yt

from data_imports.day_aggregate import reduce_yuid_log_events_day, reduce_device_log_events_day, \
    reduce_change_devid, finalize_yuid_with_x_day_tables, finalize_device_yuid_day_tables
from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from utils import uat_utils
from utils import utils
from v2.soup import soup_config
from v2.soup.soup_tables import SoupDailyLogTable


def extract_uuid(referer):
    """Return the value that follows the first ``uuid=`` in ``referer``.

    The value runs up to the next ``&`` and is returned as-is.  When no ``&``
    follows, the rest of the string is returned with surrounding whitespace
    stripped.  An empty string is returned when ``uuid=`` does not occur.
    """
    _, marker, tail = referer.partition('uuid=')
    if not marker:
        return ''
    value, ampersand, _ = tail.partition('&')
    if ampersand:
        return value
    return tail.strip()


# Matches a ``yandexuid=<digits>`` pair that starts on a word boundary.
yandexuid_regex = re.compile(r'\byandexuid=(\d+)')


def extract_yuid(body):
    """Return the digits of the first ``yandexuid=<digits>`` pair in ``body``.

    Returns an empty string when no such pair is present.
    """
    # Cheap substring test first so the regex only runs on likely candidates.
    if 'yandexuid=' not in body:
        return ''
    found = yandexuid_regex.search(body)
    return found.group(1) if found else ''


def extract_yp(body):
    """Return the text between the first ``yp=`` and the next ``;`` in ``body``.

    The extracted text also ends at a second ``yp=`` if one appears first.
    Returns an empty string when ``yp=`` does not occur.
    """
    if 'yp=' not in body:
        return ''
    # Limiting the split to two cuts still isolates the piece between the
    # first and the second marker.
    after_marker = body.split('yp=', 2)[1]
    return after_marker.partition(';')[0]


def extract_yu(body):
    """Return the token that follows the first ``.yu.`` marker in ``body``.

    The token ends at the next ``.`` and is additionally cut at the first
    ``#``.  Returns an empty string when the marker is absent.
    """
    if '.yu.' not in body:
        return ''
    segment = body.split('.yu.', 2)[1]
    token = segment.partition('.')[0]
    return token.partition('#')[0]


def map_uuid(yuid, rec):
    """Yield a uuid-yuid link for ``/searchapp/jl`` requests carrying a uuid.

    The timestamp is the integer made of the first 10 characters of
    ``req_id``; when that is missing or zero, the integer part of
    ``request_time`` is used; otherwise it is 0.  It is emitted negated.
    """
    request = rec.get('request', '')
    if not request.startswith('/searchapp/jl'):
        return
    uuid = extract_uuid(request)
    if not uuid:
        return

    ts = 0
    if rec.get('req_id', None):
        ts = int(rec['req_id'][:10])
    if not ts and rec.get('request_time', None):
        ts = int(rec['request_time'].split('.')[0])

    yield {'uuid': uuid,
           'yuid': yuid,
           'ts': -ts,
           'source': config.ID_SOURCE_TYPE_STARTUP}


def map_kinopoisk(yuid, rec):
    """Yield a kinopoisk-uid-to-yuid link for records of kinopoisk hosts.

    A record qualifies when ``canonized_vhost`` contains ``kinopoisk`` and
    ``cookies`` is non-empty and not the ``-`` placeholder.  The ``uid``
    cookie must parse as an integer; otherwise nothing is yielded.
    """
    host = rec.get('canonized_vhost')
    cookies = rec.get('cookies')
    if not host or 'kinopoisk' not in host or not cookies or cookies == '-':
        return

    kp_uid = mr.get_field_value('uid', cookies, separator='; ')
    try:
        int(kp_uid)
    except (TypeError, ValueError):
        # ValueError: the cookie value is not numeric.
        # TypeError: guards against the lookup returning None for a missing
        # cookie, which would otherwise abort the whole job.
        return
    if kp_uid:
        yield {"id_value": kp_uid, "yuid": yuid}


def map_startup(yuid, rec):
    """Yield a uuid-yuid link for requests whose query string has a ``uuid``.

    The uuid is the first ``uuid`` query parameter of ``request``; the
    timestamp comes from ``utils.get_ts`` applied to the record's
    ``iso_eventtime`` and ``timezone``.  Nothing is yielded unless both are
    truthy.  The timestamp is emitted negated.
    """
    request = rec.get('request', '')
    if 'uuid=' not in request:
        return

    query = urlparse.urlsplit(request).query
    uuid_values = urlparse.parse_qs(query).get('uuid', [''])
    uuid = uuid_values[0]
    ts = utils.get_ts(rec.get('iso_eventtime', ''), rec.get('timezone', ''))
    if not (uuid and ts):
        return

    yield {'uuid': uuid,
           'yuid': yuid,
           'ts': -ts,
           'source': config.ID_SOURCE_TYPE_ACCESS_LOG}


def map_ips(yuid, rec):
    """Yield a yuid-to-IP link for the record's ``ip`` field.

    Every ``::ffff:`` occurrence is removed from the address first.  Empty
    results, ``127.0.0.1`` and ``0.0.0.0`` are skipped.  ``ipv6`` is true when
    the remaining address contains a colon.
    """
    raw_ip = rec.get('ip')
    if not raw_ip:
        return
    ip = raw_ip.replace('::ffff:', '')
    if not ip or ip in ('127.0.0.1', '0.0.0.0'):
        return
    yield {'yuid': yuid, 'id_value': ip, 'ipv6': ':' in ip}


def map_user_agent(yuid, rec):
    """Yield a yuid-to-user-agent link when the record has a user agent."""
    user_agent = rec.get('user_agent')
    if not user_agent:
        return
    yield dict(yuid=yuid, id_value=user_agent)


# Captures the run of letters, digits and dashes between ``did`` and the next
# ``#`` inside the ``yp`` cookie value.
# NOTE(review): the dots around ``did`` are unescaped and therefore match any
# character; escape them if a literal ``.did.`` is intended.
did_regex = re.compile(r'.did.(?P<did>[a-zA-Z0-9-]*)#.*')


def map_did(yuid, rec):
    """Yield a yuid-to-device-id link taken from the ``yp`` cookie.

    Nothing is yielded when the record has no cookies, no ``yp`` cookie, no
    ``did`` entry inside it, or a ``unixtime`` that is not an integer.  The
    output also carries the timestamp and the ``OSFamily`` reported for the
    record's user agent.
    """
    cookies = rec.get('cookies')
    if not cookies:
        return
    yp = mr.get_field_value("yp", cookies, separator="; ")
    if not yp:
        return
    match = did_regex.search(yp)
    if not match:
        return

    # Validate the timestamp before parsing the user agent, so rejected
    # records cost nothing more.  TypeError covers ``unixtime`` being None.
    try:
        ts = int(rec.get('unixtime', ''))
    except (TypeError, ValueError):
        return

    os_family = uat_utils.Ua(rec.get('user_agent')).profile_info.get('OSFamily')
    yield {config.ID_TYPE_YUID: yuid,
           config.ID_TYPE_MMETRIC_DEVID: match.group('did'),
           'ts': ts,
           'os': os_family
           }


def map_os(rec):
    """Route a record by OS: table 0 for ``iOS``, table 1 for anything else.

    The record is tagged in place through ``@table_index`` and yielded.
    """
    rec['@table_index'] = 0 if rec['os'] == 'iOS' else 1
    yield rec


# Maps each mapper to a converter that turns one of its output records into a
# soup daily-log record.  A converter receives the mapper's record and an
# index, and addresses table ``index_ + 1``.  ``None`` marks mappers whose
# records have no soup counterpart.
transformation_dict = {
    map_uuid: lambda rec, index_: SoupDailyLogTable.make_rec(
        rec['yuid'],
        rec['uuid'],
        soup_config.yuid_uuid_al,
        ts=rec['ts'],
        table_index=index_ + 1
    ),
    map_kinopoisk: lambda rec, index_: SoupDailyLogTable.make_rec(
        rec['yuid'],
        rec['id_value'],
        soup_config.yuid_kpid_al,
        table_index=index_ + 1
    ),
    map_startup: lambda rec, index_: SoupDailyLogTable.make_rec(
        rec['yuid'],
        rec['uuid'],
        soup_config.yuid_uuid_sal,
        ts=rec['ts'],
        table_index=index_ + 1
    ),
    # nothing to do, we dont work with IPs
    map_ips: None,
    # nothing to do, we dont work with User-Agents
    map_user_agent: None,
    map_did: lambda rec, index_: SoupDailyLogTable.make_rec(
        rec['yuid'],
        rec['mmetric_devid'],
        soup_config.yuid_devid_yp_al,
        ts=rec['ts'],
        table_index=index_ + 1
    ),
}


def map_access(rec, mappers):
    """Fan one access-log record out to the given mappers.

    Records without a numeric ``yandexuid`` cookie or without a user agent
    are dropped.  Records whose user agent is reported as bad are passed
    through unchanged to table ``len(mappers)``.  Otherwise every record
    produced by the mapper at position ``i`` goes to table ``i``; when
    ``transformation_dict`` has a converter for that mapper, the converted
    record is yielded right after it, with ``len(mappers)`` as the index
    argument.

    :param rec: access-log record (a dict).
    :param mappers: ordered list of ``mapper(yuid, rec)`` generator functions;
        each must be a key of ``transformation_dict``.
    """
    # ``cookies`` may be present with a null value, so fall back explicitly
    # instead of relying on the ``get`` default.
    yuid = extract_yuid(rec.get('cookies') or '')
    user_agent = rec.get('user_agent')
    if not (yuid and user_agent):
        return

    try:
        int(yuid)
    except ValueError:
        return

    if uat_utils.Ua(user_agent).is_bad():
        rec['@table_index'] = len(mappers)  # bad ua
        yield rec
        return

    for table_index, mapper in enumerate(mappers):
        for out_rec in mapper(yuid, rec):
            out_rec['@table_index'] = table_index
            yield out_rec
            target = transformation_dict[mapper]
            if target:
                yield target(out_rec, len(mappers))


def join_kinopoisk_email(kp_uid_key, recs):
    """Join yuid records with kinopoisk emails that share one ``id_value``.

    ``mr.split_left_right`` splits ``recs`` into the yuid records and the
    email dictionary records.  For every pair, a copy of the yuid record is
    yielded in which the kinopoisk uid is moved to the
    ``config.ID_TYPE_KINOPOISK_UID`` field and ``id_value`` / ``id_type`` are
    replaced with the email.  If the split exceeds its memory limit, a single
    diagnostic row is written to table 1 instead.
    """
    try:
        yuid_raw_recs, kp_email_recs = mr.split_left_right(recs)
    except mr.OomLimitException as oom:
        yield {'kp_yuid': kp_uid_key['id_value'], 'oom': oom.recs_count, '@table_index': 1}
        return

    for yuid_raw in yuid_raw_recs:
        for kp_email in kp_email_recs:
            # Build a fresh row per pair: re-yielding one mutated dict would
            # make all rows for this yuid alias the same object.
            joined = dict(yuid_raw)
            # change kp uid to email
            joined[config.ID_TYPE_KINOPOISK_UID] = kp_email['id_value']
            joined['id_value'] = kp_email['email']
            joined['id_type'] = config.ID_TYPE_EMAIL
            yield joined


def reduce_over_limit(key, recs, field, limit):
    """Emit the distinct ``field`` values of a key unless there are too many.

    When more than ``limit`` distinct values are seen, one copy of ``key`` is
    yielded to table 1 and processing stops.  Otherwise one copy of ``key``
    per distinct value is yielded, with ``field`` set to that value.
    """
    distinct = set()
    for row in recs:
        distinct.add(row[field])
        if len(distinct) > limit:
            overflow = dict(key)
            overflow.update({'@table_index': 1})
            yield overflow
            return

    for value in distinct:
        out = dict(key)
        out.update({field: value})
        yield out


class ImportAccessLogsDayTask(yt_luigi.BaseYtTask):
    """Luigi task that imports one day of the statbox access log.

    ``run`` maps the day's access log through ``map_access``, builds per-day
    yuid and device tables from the mapper outputs, enriches kinopoisk uids
    with emails from the kinopoisk dictionary, and re-keys ``yp`` device ids
    through the ``devid_hash`` dictionary.
    """
    # Day of the access log; used to build the input and output paths.
    date = luigi.Parameter()
    # Compared with ``date`` in ``output()``: soup daily tables are only
    # reported as outputs when the two are equal.
    run_date = luigi.Parameter()

    # luigi resource requirement declared for this task.
    resources = {'import_access_log_lock': 1}

    def __init__(self, *args, **kwargs):
        """Initialise the task and the soup daily-log table for ``date``."""
        super(ImportAccessLogsDayTask, self).__init__(*args, **kwargs)
        self.soup_log = SoupDailyLogTable(soup_config.LOG_SOURCE_ACCESS_LOG, self.date)

    def input_folders(self):
        """Return the named input folders (presumably resolved by ``in_f``)."""
        return {
            'statbox': config.STATBOX_ACCESS_FOLDER,
            'dict': config.GRAPH_YT_DICTS_FOLDER
        }

    def output_folders(self):
        """Return the named per-date output folders (presumably resolved by ``out_f``)."""
        return {
            'devid_raw': config.INDEVICE_YT_FOLDER + self.date + '/perfect/devid_raw_day/',
            'raw_links': config.YT_OUTPUT_FOLDER + self.date + '/raw_links/',
            'yuid_raw': config.YT_OUTPUT_FOLDER + self.date + '/yuid_raw/',
            'kinopoisk': config.YT_OUTPUT_FOLDER + self.date + '/kinopoisk/',
            'logs': config.INDEVICE_YT_FOLDER + self.date + '/perfect/logs/',
        }

    def requires(self):
        """Require the day's statbox access log and the kinopoisk dictionary."""
        return [yt_luigi.ExternalInput(self.in_f('statbox') + self.date),
                yt_luigi.ExternalInput(self.in_f('dict') + 'kinopoisk')]

    def run(self):
        """Run the whole import pipeline for ``date``."""

        mr.mkdir(self.out_f('devid_raw'))
        mr.mkdir(self.out_f('yuid_raw'))
        mr.mkdir(self.out_f('raw_links'))
        mr.mkdir(self.out_f('kinopoisk'))
        mr.mkdir(self.out_f('logs'))
        self.soup_log.ensure_dir()

        # import log
        # The output list is positional: one table per mapper in the same
        # order as ``mappers``, then the bad-user-agent table at index
        # ``len(mappers)``, then the soup log at ``len(mappers) + 1``
        # (see ``map_access`` and ``transformation_dict``).
        yt.run_map(partial(map_access, mappers=[map_uuid, map_kinopoisk, map_startup, map_ips, map_user_agent, map_did]),
                   self.in_f('statbox') + self.date,
                   [self.out_f('logs') + 'uuid_yuid_access',
                    self.out_f('kinopoisk') + 'yuid_uid_kinopoisk',
                    self.out_f('logs') + 'uuid_yuid_startup',
                    self.out_f('raw_links') + 'yuid_ip_access_tmp',
                    self.out_f('raw_links') + 'yuid_ua_access_tmp',
                    self.out_f('devid_raw') + 'devid_yuid_allos_access_tmp',
                    self.out_f('raw_links') + 'bad_ua_access',
                    self.soup_log.create(),
                    ],
                   spec=mr.DATA_SIZE_PER_JOB_2GB_SPEC)

        self.soup_log.prepare_daily_tables_from_log()

        # The two uuid-yuid logs are sorted here because the reduces below
        # are plain ``run_reduce`` calls over the same sort columns.
        mr.sort_all([self.out_f('logs') + 'uuid_yuid_access', self.out_f('logs') + 'uuid_yuid_startup'],
                    sort_by=[config.ID_TYPE_UUID, config.ID_TYPE_YUID, 'ts'])

        # Final table paths.
        # NOTE(review): ``output()`` builds the ip/ua table names from config
        # constants while they are spelled out literally here - confirm that
        # both spellings resolve to the same paths.
        kp_uid_out_table = self.out_f('yuid_raw') + 'yuid_with_' + \
                           config.ID_TYPE_KINOPOISK_UID + '_' + config.ID_SOURCE_TYPE_KINOPOISK
        kp_email_out_table = self.out_f('yuid_raw') + 'yuid_with_' + \
                             config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_KINOPOISK
        yuid_with_ip_access_log = self.out_f('yuid_raw') + 'yuid_with_ip_access_log'
        yuid_with_ua_access_log = self.out_f('yuid_raw') + 'yuid_with_ua_access_log'

        uuid_yuid_access = self.out_f('devid_raw') + 'uuid_yuid_' + config.ID_SOURCE_TYPE_ACCESS_LOG
        devid_yuid_yp_ios = self.out_f('devid_raw') + 'devid_yuid_' + config.ID_SOURCE_TYPE_ACCESS_YP_IOS
        devid_yuid_yp_android = self.out_f('devid_raw') + 'devid_yuid_' + config.ID_SOURCE_TYPE_ACCESS_YP_ANDROID
        uuid_yuid_startup = self.out_f('devid_raw') + 'uuid_yuid_' + config.ID_SOURCE_TYPE_STARTUP

        # day tables
        # All five operations are started with ``sync=False`` and handed to
        # ``utils.wait_all`` (presumably it blocks until they all finish).
        utils.wait_all([
            yt.run_reduce(partial(reduce_device_log_events_day, dt=self.date,
                                  source_type=config.ID_SOURCE_TYPE_ACCESS_LOG),
                          self.out_f('logs') + 'uuid_yuid_access',
                          uuid_yuid_access,
                          sort_by=[config.ID_TYPE_UUID, config.ID_TYPE_YUID, 'ts'],
                          reduce_by=[config.ID_TYPE_UUID, config.ID_TYPE_YUID], sync=False),
            yt.run_reduce(partial(reduce_device_log_events_day, dt=self.date,
                                  source_type=config.ID_SOURCE_TYPE_STARTUP),
                          self.out_f('logs') + 'uuid_yuid_startup',
                          uuid_yuid_startup,
                          sort_by=[config.ID_TYPE_UUID, config.ID_TYPE_YUID, 'ts'],
                          reduce_by=[config.ID_TYPE_UUID, config.ID_TYPE_YUID], sync=False),
            yt.run_map_reduce(None, partial(reduce_yuid_log_events_day, dt=self.date,
                                            id_type=config.ID_TYPE_KINOPOISK_UID,
                                            source_type=config.ID_SOURCE_TYPE_KINOPOISK),
                              self.out_f('kinopoisk') + 'yuid_uid_kinopoisk',
                              kp_uid_out_table,
                              reduce_by=config.ID_TYPE_YUID, sync=False),
            yt.run_map_reduce(None, partial(reduce_yuid_log_events_day, dt=self.date,
                                            id_type=config.ID_TYPE_IP,
                                            source_type=config.ID_SOURCE_TYPE_ACCESS_LOG),
                              self.out_f('raw_links') + 'yuid_ip_access_tmp',
                              yuid_with_ip_access_log,
                              reduce_by=config.ID_TYPE_YUID, sync=False),
            yt.run_map_reduce(None, partial(reduce_yuid_log_events_day, dt=self.date,
                                            id_type=config.FIELD_UA,
                                            source_type=config.ID_SOURCE_TYPE_ACCESS_LOG),
                              self.out_f('raw_links') + 'yuid_ua_access_tmp',
                              yuid_with_ua_access_log,
                              reduce_by=config.ID_TYPE_YUID, sync=False)
        ])

        # enrich kp with email
        # Table 0 of the join receives the email rows; table 1 receives the
        # out-of-memory diagnostics emitted by ``join_kinopoisk_email``.
        yt.run_sort(kp_uid_out_table, sort_by='id_value')
        yt.run_reduce(join_kinopoisk_email,
                      [kp_uid_out_table, self.in_f('dict') + 'kinopoisk'],
                      [kp_email_out_table, self.out_f('kinopoisk') + 'email_join_oom'],
                      reduce_by='id_value')

        finalize_yuid_with_x_day_tables([
            kp_uid_out_table,
            kp_email_out_table,
            yuid_with_ip_access_log,
            yuid_with_ua_access_log,
        ])


        # Intermediate mapper outputs are no longer needed.
        mr.drop(self.out_f('logs') + 'uuid_yuid_access')
        mr.drop(self.out_f('logs') + 'uuid_yuid_startup')
        mr.drop(self.out_f('raw_links') + 'yuid_ip_access_tmp')
        mr.drop(self.out_f('raw_links') + 'yuid_ua_access_tmp')

        # mmetrik_devid to crypta devid
        # Step 1: split the all-OS table into iOS (table 0) and everything
        # else (table 1), as decided by ``map_os``.
        yt.run_map(map_os,
                   self.out_f('devid_raw') + 'devid_yuid_allos_access_tmp',
                   [devid_yuid_yp_ios + '_tmp',
                    devid_yuid_yp_android + '_tmp']
                   )
        # Step 2: sort both halves for the per-(yuid, devid) reduce.
        # NOTE(review): unlike the groups passed to ``wait_all`` elsewhere,
        # these calls do not pass ``sync=False`` - confirm whether they are
        # meant to run in parallel.
        utils.wait_all([
            yt.run_sort(devid_yuid_yp_ios + '_tmp',
                        sort_by=[config.ID_TYPE_YUID, config.ID_TYPE_MMETRIC_DEVID, 'ts']),
            yt.run_sort(devid_yuid_yp_android + '_tmp',
                        sort_by=[config.ID_TYPE_YUID, config.ID_TYPE_MMETRIC_DEVID, 'ts'])
        ])
        # Step 3: aggregate the day's events per (yuid, devid) pair.
        utils.wait_all([
            yt.run_reduce(partial(reduce_device_log_events_day, dt=self.date,
                                      source_type=config.ID_SOURCE_TYPE_ACCESS_YP_IOS),
                          devid_yuid_yp_ios + '_tmp',
                          devid_yuid_yp_ios + '_tmp2',
                          sort_by=[config.ID_TYPE_YUID, config.ID_TYPE_MMETRIC_DEVID, 'ts'],
                          reduce_by=[config.ID_TYPE_YUID, config.ID_TYPE_MMETRIC_DEVID], sync=False),
            yt.run_reduce(partial(reduce_device_log_events_day, dt=self.date,
                                      source_type=config.ID_SOURCE_TYPE_ACCESS_YP_ANDROID),
                          devid_yuid_yp_android + '_tmp',
                          devid_yuid_yp_android + '_tmp2',
                          sort_by=[config.ID_TYPE_YUID, config.ID_TYPE_MMETRIC_DEVID, 'ts'],
                          reduce_by=[config.ID_TYPE_YUID, config.ID_TYPE_MMETRIC_DEVID], sync=False)
        ])
        # Step 4: sort a local copy of the ``devid_hash`` dictionary and both
        # aggregated tables by device id for the join below.
        utils.wait_all([
            yt.run_sort(self.in_f('dict') + 'devid_hash', self.out_f('devid_raw') + 'devid_hash',
                        sort_by=config.ID_TYPE_MMETRIC_DEVID, sync=False),
            yt.run_sort(devid_yuid_yp_ios + '_tmp2',
                        sort_by=config.ID_TYPE_MMETRIC_DEVID, sync=False),
            yt.run_sort(devid_yuid_yp_android + '_tmp2',
                        sort_by=config.ID_TYPE_MMETRIC_DEVID, sync=False)
        ])
        # Step 5: join with ``devid_hash`` through ``reduce_change_devid``;
        # the second output of each reduce is the ``_nodevid`` table.
        # NOTE(review): ``sync=False`` is omitted here as well - confirm.
        utils.wait_all([
            yt.run_reduce(reduce_change_devid, [self.out_f('devid_raw') + 'devid_hash',
                                                devid_yuid_yp_ios + '_tmp2'],
                          [devid_yuid_yp_ios,
                           devid_yuid_yp_ios + '_nodevid'],
                          reduce_by=config.ID_TYPE_MMETRIC_DEVID),
            yt.run_reduce(reduce_change_devid, [self.out_f('devid_raw') + 'devid_hash',
                                                devid_yuid_yp_android + '_tmp2'],
                          [devid_yuid_yp_android,
                           devid_yuid_yp_android + '_nodevid'],
                          reduce_by=config.ID_TYPE_MMETRIC_DEVID)
        ])

        finalize_device_yuid_day_tables([
            uuid_yuid_access,
            devid_yuid_yp_ios,
            devid_yuid_yp_android,
            uuid_yuid_startup
        ])

        # Remove the temporary device tables and the local dictionary copy.
        mr.drop(self.out_f('devid_raw') + 'devid_yuid_allos_access_tmp')
        mr.drop(devid_yuid_yp_android + '_tmp')
        mr.drop(devid_yuid_yp_android + '_tmp2')
        mr.drop(devid_yuid_yp_ios + '_tmp')
        mr.drop(devid_yuid_yp_ios + '_tmp2')
        mr.drop(self.out_f('devid_raw') + 'devid_hash')

    def output(self):
        """Return the targets that mark this task as complete.

        These are the two uuid-yuid day tables, the kinopoisk uid and email
        tables and the ip / user-agent tables, followed by the soup daily
        tables when ``date`` equals ``run_date``.
        """
        if self.date == self.run_date:
            soup_out_tables = self.soup_log.daily_tables_targets()
        else:
            soup_out_tables = []

        kp_uid_out_table = self.out_f('yuid_raw') + 'yuid_with_' + \
                           config.ID_TYPE_KINOPOISK_UID + '_' + config.ID_SOURCE_TYPE_KINOPOISK
        kp_email_out_table = self.out_f('yuid_raw') + 'yuid_with_' + \
                             config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_KINOPOISK
        yuid_ip_table = self.out_f('yuid_raw') + 'yuid_with_' + \
                        config.ID_TYPE_IP + '_' + config.ID_SOURCE_TYPE_ACCESS_LOG
        yuid_ua_table = self.out_f('yuid_raw') + 'yuid_with_' + \
                        config.FIELD_UA + '_' + config.ID_SOURCE_TYPE_ACCESS_LOG
        return [yt_luigi.YtTarget(t) for t in [self.out_f('devid_raw') + 'uuid_yuid_' + config.ID_SOURCE_TYPE_ACCESS_LOG,
                                               self.out_f('devid_raw') + 'uuid_yuid_' + config.ID_SOURCE_TYPE_STARTUP,
                                               kp_uid_out_table,
                                               kp_email_out_table,
                                               yuid_ip_table,
                                               yuid_ua_table]] + soup_out_tables


def upload_kinopoisk(dump_path='/home/artembelov/kp/uid-email.csv'):
    """Upload a kinopoisk ``uid,email`` CSV dump as the kinopoisk dictionary.

    The dump is written to ``config.GRAPH_YT_DICTS_FOLDER + 'kinopoisk'``,
    de-duplicated on all three columns and sorted by ``id_value``.

    :param dump_path: path of the CSV dump; the first line is a header.
        Defaults to the location used historically.
    :raises ValueError: if a uid in the dump is not an integer.
    """
    kinopoisk_table = config.GRAPH_YT_DICTS_FOLDER + 'kinopoisk'

    def read_dump(path):
        """Yield one dictionary row per well-formed line of the dump."""
        with open(path) as f:
            bad_email_count = 0
            for idx, line in enumerate(f):
                if idx == 0:
                    # header line
                    continue

                # progress indicator
                if idx % 10000 == 0:
                    print(idx)

                splitted = line.rstrip('\n').split(',')
                kp_uid = splitted[0]
                email_part = splitted[1:]

                # More than one field after the uid means the email itself
                # contains a comma; such lines are reported and skipped.
                if len(email_part) > 1:
                    bad_email_count += 1
                    # Single-argument form prints the same text on
                    # Python 2 and Python 3.
                    print('%s %s' % (line, bad_email_count))
                    continue

                email = ''.join(email_part).strip('"')
                yield {'kp_uid': int(kp_uid), 'id_value': kp_uid, 'email': email}

    yt.write_table(kinopoisk_table, read_dump(dump_path), raw=False)

    mr.distinct_by(['kp_uid', 'id_value', 'email'],
                   kinopoisk_table,
                   kinopoisk_table)

    yt.run_sort(kinopoisk_table, sort_by='id_value')


if __name__ == '__main__':
    # Ad-hoc debug run of the access-log mapper against a fixed day.
    yt.config.set_proxy(config.MR_SERVER)

    # dt = '2016-08-23'
    # import smart_runner
    # smart_runner.run_isolated('//home/crypta/team/rodion/CRYPTAIS-996/', dt, ImportAccessLogsDayTask,
    #                           ImportAccessLogsDayTask, date=dt, run_date=dt)

    workdir = '//home/crypta/team/artembelov/access-log/'

    # The output list is positional: one table per mapper, then the
    # bad-user-agent table at index len(mappers), then the table for the soup
    # records that map_access routes to index len(mappers) + 1.
    yt.run_map(partial(map_access, mappers=[map_uuid, map_kinopoisk, map_startup, map_ips, map_user_agent]),
               '//statbox/access-log/2017-01-09',
               [workdir + 'uuid_yuid_access',
                workdir + 'yuid_uid_kinopoisk',
                workdir + 'uuid_yuid_startup',
                workdir + 'yuid_ip_access_tmp',
                workdir + 'yuid_ua_access_tmp',
                workdir + 'bad_ua_access',
                # Without this table the soup records have no destination.
                # NOTE(review): assumes make_rec output can be written to a
                # plain table - confirm.
                workdir + 'soup_daily_log'],
               spec=mr.DATA_SIZE_PER_JOB_2GB_SPEC)
