#!/usr/bin/python -tt
# coding=utf-8
import sys

import luigi
import yt.wrapper as yt

from data_imports.import_logs import graph_import_fp, app_metrica_day
from data_imports.import_logs.webvisor import graph_webvisor
from lib.luigi import base_luigi_task
from lib.luigi import yt_luigi
from matching.device_matching.fuzzy_heuristic.paths import (
    get_graph_table, get_table_name, ip_bad, ip_actions_nobad, ip_intervals, device_yuid_fuzzy_pairs)

from rtcconf import config
from utils import mr_utils as mr
from v2 import ids

__author__ = 'rodion'

# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably a "far future" upper bound for unix timestamps in seconds --
# confirm whether other modules import it before changing or removing.
big_ts = 9000000000


def reduce_ip_actions(_, recs):
    """Re-key a group of records by ip, stamping each with the group's device.

    The device id and ua_profile are read from the first record of the group
    only, so the group has to arrive sorted with the record carrying them
    first (the original note says: subkey 'a' first, subkey 'b' after it).
    If the first record lacks either field, the whole group is dropped.

    :param _: reduce key (unused)
    :param recs: iterable of records with a tab-separated ``value`` field
    :return: generator of records ``key=<ip>, subkey='b', value=<fields>``
    """
    device_id = None
    profile = None

    for row in recs:
        if not device_id:
            device_id = mr.get_field_value('devid', row['value'])
            profile = mr.get_field_value('ua_profile', row['value'])
            if not (device_id and profile):
                # Nothing to attribute the group's actions to.
                return

        ip = mr.get_field_value('ip', row['value'])
        ts = mr.get_field_value('ts', row['value'])

        packed = 'devid=' + device_id + '\tts=' + str(ts) + '\tua_profile=' + profile
        yield {'key': ip, 'subkey': 'b', 'value': packed}


def reduce_ip_bad(ip_key, recs):
    """Flag an ip as "bad" once 30 distinct device ids have been seen on it.

    :param ip_key: reduce key, a mapping with an ``ip`` field
    :param recs: iterable of records carrying ``ids.CRYPTA_DEVICE_ID``
    :return: generator yielding at most one ``{ip, is_bad=True}`` record
    """
    seen_devices = set()
    for row in recs:
        seen_devices.add(row[ids.CRYPTA_DEVICE_ID])
        if len(seen_devices) < 30:
            continue
        # Threshold reached: one marker record is enough, skip the rest.
        yield {'ip': ip_key['ip'], 'is_bad': True}
        return


def reduce_ip_actions_nobad(key, recs):
    """Re-key device actions by device id, dropping ips flagged as bad.

    As soon as a record with a truthy ``is_bad`` field is met, the rest of the
    group is discarded; the calling job sorts each group so that such a marker
    record, when present, arrives before the action records.

    :param key: reduce key, a mapping with an ``ip`` field
    :param recs: iterable of marker and/or action records for that ip
    :return: generator of records keyed by device id with subkey
             ``ip=<ip>|ts=<ts>`` and value ``\\tua_profile=<ua_profile>``
    """
    for row in recs:
        if row.get('is_bad'):
            return

        device_id = row[ids.CRYPTA_DEVICE_ID]
        profile = row['ua_profile']
        ip = key['ip']
        ts = row['ts']

        ip_and_ts = 'ip=' + ip + '|ts=' + str(ts)
        yield {'key': device_id, 'subkey': ip_and_ts, 'value': '\tua_profile=' + profile}


def reduce_ip_intervals(key, recs):
    deviceid = key['key']
    min_ts, max_ts = sys.maxint, 0
    ip, ua_profile, prev_ip = '', '', ''

    for rec in recs:
        ua_profile = mr.get_field_value('ua_profile', rec['value'])
        ip = mr.get_field_value('ip', rec['subkey'], '|')
        ts = int(mr.get_field_value('ts', rec['subkey'], '|'))

        if prev_ip != ip:
            if prev_ip:
                yield dict(key=prev_ip + '|' + ua_profile, subkey='a',
                           value='min_ts=' + str(min_ts) + '\tmax_ts=' + str(max_ts) + '\tdevid=' + deviceid)
            min_ts, max_ts = ts - config.DELTA_TS, ts + config.DELTA_TS
            prev_ip = ip
        else:
            prev_max_ts = max_ts
            min_ts = min(ts - config.DELTA_TS, min_ts)
            max_ts = max(ts + config.DELTA_TS, max_ts)
            if max_ts - min_ts >= config.WINDOW_TS:
                yield dict(key=ip + '|' + ua_profile, subkey='a',
                           value='min_ts=' + str(min_ts) + '\tmax_ts=' + str(prev_max_ts) + '\tdevid=' + deviceid)
                min_ts = max_ts - config.DELTA_TS * 2

    if ip and ua_profile:
        yield dict(key=ip + '|' + ua_profile, subkey='a',
                   value='min_ts=' + str(min_ts) + '\tmax_ts=' + str(max_ts) + '\tdevid=' + deviceid)


def reduce_device_yuid_pairs(key, recs):
    """Match browser traffic against device activity intervals for one key.

    Records with subkey 'a' are device intervals and are buffered in memory;
    every other record is treated as a traffic hit and compared against all
    intervals buffered so far, so interval records must arrive first (the
    calling job sorts by ``key, subkey``).  If more than 100000 intervals have
    been buffered, the key itself is written to output table 1 and the rest
    of the group is skipped.

    :param key: reduce key, a mapping with a ``key`` field
    :param recs: iterable of interval and traffic records
    :return: generator of ``key='<devid>_<yuid>'`` hit records, or a single
             overflow record tagged with ``@table_index = 1``
    """
    windows = []
    for row in recs:
        if len(windows) > 100000:
            overflow = dict(key)
            overflow['@table_index'] = 1
            yield overflow
            return

        if row['subkey'] == 'a':
            device = mr.get_field_value('devid', row['value'])
            upper = int(mr.get_field_value('max_ts', row['value']))
            lower = int(mr.get_field_value('min_ts', row['value']))
            windows.append((lower, upper, device))
            continue

        yuid = mr.get_field_value('yuid', row['value'])
        user_agent = mr.get_field_value('user_agent', row['value'])
        wapprofile = mr.get_field_value('wapprofile', row['value'])
        ts = int(mr.get_field_value('ts', row['value']))

        for lower, upper, device in windows:
            pair_key = device + '_' + yuid
            if not lower <= ts <= upper:
                continue
            hit = 'hit=1\tuser_agent=' + user_agent
            if wapprofile:
                hit += '\twapprofile=' + wapprofile
            yield {'key': pair_key, 'subkey': key['key'], 'value': hit}


def reduce_devid_uuid_all(key, recs):
    """Emit the distinct ``value`` fields seen for one key, capped in size.

    Reading stops as soon as more than 1000 distinct values have been
    collected, so at most 1001 records are produced per key.

    :param key: reduce key, a mapping with a ``key`` field
    :param recs: iterable of records with a ``value`` field
    :return: generator of records ``key=<key>, subkey='du', value=<value>``
    """
    collected = set()
    for value in (row['value'] for row in recs):
        collected.add(value)
        if len(collected) > 1000:
            break

    for value in collected:
        yield {'key': key['key'], 'subkey': 'du', 'value': value}


def reduce_yuids_ua(key, recs):
    """Count the records of a group and emit a single summary record.

    The summary carries the subkey of the last record seen (empty string for
    an empty group) and the record count rendered as a string.

    :param key: reduce key, a mapping with a ``key`` field
    :param recs: iterable of records with a ``subkey`` field
    :return: generator yielding exactly one record
    """
    last_subkey = ''
    total = 0
    for total, row in enumerate(recs, 1):
        last_subkey = row['subkey']

    yield {'key': key['key'], 'subkey': last_subkey, 'value': str(total)}


def run_matching_for_day(dt):
    """Run the four-stage fuzzy ip matching pipeline for one day.

    Stages: find bad ips, drop their actions, build per-device activity
    intervals, then join the intervals with browser traffic.  The three
    intermediate tables are dropped at the end; the two output tables (pairs
    and the '_oom' overflow table) are kept and have their chunks merged.

    :param dt: day identifier passed through to the table-path helpers
    """
    source_table = get_graph_table(dt, 'mobile/ip_dev_stream')
    partition_count = mr.calculate_optimized_mr_partition_count(source_table)

    bad_ips_table = get_table_name(dt, ip_bad)
    clean_actions_table = get_table_name(dt, ip_actions_nobad)
    intervals_table = get_table_name(dt, ip_intervals)
    out_tables = [
        get_table_name(dt, device_yuid_fuzzy_pairs),
        get_table_name(dt, device_yuid_fuzzy_pairs + '_oom'),
    ]

    # Find bad ips
    yt.run_map_reduce(
        None, reduce_ip_bad,
        source_table,
        bad_ips_table,
        reduce_by='ip',
        spec={'partition_count': partition_count},
    )

    # Remove bad ips.
    # Sorting by device id inside each ip lets the ip_bad marker record come
    # first (marker records are written without a device id field).
    yt.run_map_reduce(
        None, reduce_ip_actions_nobad,
        [bad_ips_table, source_table],
        clean_actions_table,
        reduce_by='ip',
        sort_by=['ip', ids.CRYPTA_DEVICE_ID],
        spec={'partition_count': partition_count},
    )

    # Create activity intervals
    yt.run_map_reduce(
        None, reduce_ip_intervals,
        clean_actions_table,
        intervals_table,
        reduce_by='key',
        sort_by=['key', 'subkey'],
        spec={'partition_count': partition_count},
    )

    # Compare intervals with mobile browser traffic
    yt.run_map_reduce(
        None, reduce_device_yuid_pairs,
        [get_graph_table(dt, 'ip_yuid_stream'), intervals_table],
        out_tables,
        reduce_by='key',
        sort_by=['key', 'subkey'],
    )

    mr.merge_chunks_all(out_tables)

    # Intermediate tables are no longer needed.
    for temp_table in (intervals_table, bad_ips_table, clean_actions_table):
        mr.drop(temp_table)


class DeviceYuidsFuzzyIpMatchingDayTask(base_luigi_task.BaseTask):
    """Luigi task running the fuzzy ip-based device/yuid matching for one day."""

    # Day whose logs are matched; also used to build the YT paths.
    date = luigi.Parameter()
    # Passed through unchanged to the upstream import tasks.
    run_date = luigi.Parameter()

    def requires(self):
        """Return the import tasks that must finish before matching starts."""
        upstream_tasks = (
            graph_import_fp.ImportFPDayTask,
            app_metrica_day.ImportAppMetrikaDayTask,
            graph_webvisor.ImportWebvisorTask,
        )
        return [task_cls(date=self.date, run_date=self.run_date) for task_cls in upstream_tasks]

    def run(self):
        """Create the day's 'fuzzy' folder and run the matching pipeline."""
        fuzzy_dir = config.INDEVICE_YT_FOLDER + self.date + '/fuzzy'
        mr.mkdir(fuzzy_dir)

        run_matching_for_day(self.date)

    def output(self):
        """Return the target for the day's device/yuid fuzzy pairs table."""
        pairs_table = get_table_name(self.date, device_yuid_fuzzy_pairs)
        return yt_luigi.YtTarget(pairs_table)

