#!/usr/bin/env python

from rtcconf import config
import hh_util as util
from utils import mr_utils as mr
from utils import geo_utils
from lib.luigi import yt_luigi
from data_imports.import_logs.graph_import_fp import ImportFPDayTask
import luigi
import yt.wrapper as yt
from functools import partial
from collections import defaultdict


max_session_length = 6*3600
def find_daily_hh(history, session_length=None):
    """
    Try to split stream of (ts, uid) on single IP into households.

    Whenever the same uid shows up twice with less than ``session_length``
    between the two events, every uid seen in between is merged into that
    uid's household. A uid that was never merged stays alone in its own
    household.

    :param history: list of (ts, uid) pairs, must be sorted by ts
    :param session_length: maximal gap (same units as ts) between two events of
        one uid that still counts as a single session; defaults to the
        module-level ``max_session_length``
    :return: dict {hhid -> set of uids}, dict {uid -> hhid}
    here uid is an abstract user (or device) id (can be yuid, deviceid or something else)
    """
    if session_length is None:
        # resolved at call time so that overriding the module constant still works
        session_length = max_session_length

    hhid2ids = dict()
    id2hhid = dict()
    last_event = dict()     # {uid -> i such that history[i] is the last event for uid }
    next_hhid = 0
    for i, (ts, uid) in enumerate(history):
        prev = last_event.get(uid)
        if prev is not None and ts - history[prev][0] < session_length:
            hhid = id2hhid[uid]
            # group all events strictly between the previous and the current
            # event of uid into the same HH
            for _, other_uid in history[prev + 1:i]:
                other_hhid = id2hhid[other_uid]
                if other_hhid != hhid:
                    # absorb the whole other household, not just this one uid
                    absorbed = hhid2ids.pop(other_hhid)
                    hhid2ids[hhid] |= absorbed
                    for member in absorbed:
                        id2hhid[member] = hhid
        elif uid not in id2hhid:
            # first time this uid is seen: it starts its own household
            id2hhid[uid] = next_hhid
            hhid2ids[next_hhid] = {uid}
            next_hhid += 1
        last_event[uid] = i
    return hhid2ids, id2hhid


def merge_ip_history(key, recs, holiday):
    '''
    Split events on IP into daily HH groups.

    Input records (table_index=0): events on ip; the fields read here are
    'ts' and 'yuid'. Records must arrive sorted by ts, because find_daily_hh
    relies on that order.

    :param key: reduce key, must contain 'ip'
    :param recs: iterable of event records for this ip
    :param holiday: holiday flag for the processed date, handed to every
        util.TEventHistogram created here
    :return: generator of HH member records
        (id, id_type, ip, hhid, member, time_hist)
    '''
    history = []
    # defaultdict forwards extra keyword arguments to the dict constructor, not
    # to the factory, so the flag has to be bound to the factory explicitly.
    # NOTE(review): assumes TEventHistogram accepts a `holiday` keyword - confirm.
    event_histograms = defaultdict(partial(util.TEventHistogram, holiday=holiday))

    lookup = geo_utils.get_geo_lookup(use_local_bin_file=False)

    try:
        ip = key['ip']
        # don't process Yandex ips
        # NOTE(review): the original condition tested region 9000 twice; the
        # second operand may have been meant to be a different region id.
        if lookup.is_in(ip, 9000):
            return

        rid = lookup.lookup.regionByIp(ip).id
        country = lookup.find_country_id(rid)
    except Exception:
        # best effort: an ip that cannot be geo-resolved is skipped
        return

    # temporary solution : BB doesn't have enough space for all countries, so process only events from Russia
    # to turn on all countries, comment out the two lines below
    if country != 225:
        return

    tz_offset = 10800   # +3 hours
    try:
        tz_offset = lookup.lookup.timezoneById(rid).offset
    except Exception:
        # keep the default offset when the region has no timezone info
        pass

    for rec in recs:
        ts = rec['ts']
        if not ts:
            continue
        id_with_type = (rec['yuid'], 'y')
        history.append((ts, id_with_type))
        # too many events on one ip: drop the whole ip
        if len(history) > util.max_events_on_IP_per_day:
            return
        event_histograms[id_with_type].add(ts + tz_offset)

    # history is deliberately not re-sorted here: the job feeds this reducer
    # with records already ordered by ts
    hhid2ids, id2hhid = find_daily_hh(history)

    for hhid, ids in hhid2ids.items():
        if len(ids) > util.max_ids_in_daily_hh:
            continue
        for uid, id_type in ids:
            hist = event_histograms[(uid, id_type)]
            member = {'id': uid,
                      'id_type': id_type,
                      'events_home': hist.home,
                      'events_work': hist.work,
                      'rid': rid}
            yield {'id': uid,
                   'id_type': id_type,
                   'ip': key['ip'],
                   'hhid': hhid,
                   'member': member,
                   'time_hist': hist.save()}


def parse_geo(rec):
    """
    Map one raw geo log record into per-id geo points.

    A record is kept only if it has a yuid ('yauid') or a uuid, an accuracy
    ('acc') below 1000, a non-zero timestamp ('unixtime'), non-zero 'lat' and
    'lon', and an 'ip'. Missing or empty values are treated as absent.

    :param rec: dict with the raw log fields
    :return: generator of up to two independent dicts:
        table_index=0: (lat, lon, ts, ip, id, id_type='y') when a yuid is present
        table_index=1: (lat, lon, ts, ip, uuid) when a uuid is present
    """
    yuid = rec.get('yauid')
    uuid = rec.get('uuid')
    acc_str = rec.get('acc', '')
    # a record without accuracy is treated as too inaccurate to use
    acc = float(acc_str) if acc_str else 1000000
    # `or 0` makes empty/None values behave like a missing field instead of
    # failing in int()/float()
    ts = int(rec.get('unixtime') or 0)
    lat = float(rec.get('lat') or 0)
    lon = float(rec.get('lon') or 0)
    ip = rec.get('ip')
    if not ((yuid or uuid) and acc < 1000 and ts and lat != 0 and lon != 0 and ip):
        return

    point = {'lat': lat, 'lon': lon, 'ts': ts, 'ip': ip}
    # every output gets its own dict, so the two records never alias each
    # other and the yuid fields do not leak into the uuid record
    if yuid:
        yuid_rec = dict(point)
        yuid_rec.update({'id': yuid, 'id_type': 'y', '@table_index': 0})
        yield yuid_rec
    if uuid:
        uuid_rec = dict(point)
        uuid_rec.update({'uuid': uuid, '@table_index': 1})
        yield uuid_rec


def uuid_to_devid(key, recs):
    """
    Replace uuid with device id in geo points (reducer over one uuid).

    table_index=0: dictionary record for the uuid, carries the device id
    table_index=1: uuid geo point (lat, lon, ts)

    Geo points are emitted only after a dictionary record with a non-empty
    device id has been seen for this key, so dictionary records must come
    before the geo records.

    :param key: reduce key (unused)
    :param recs: iterable of records of both tables for one uuid
    :return: generator of (id, id_type='d', lat, lon, ts) records
    """
    devid = None
    for rec in recs:
        if rec['@table_index'] == 0:
            # The original looked up rec.get(devid), i.e. the key None, so no
            # device id was ever found and nothing was emitted.
            # NOTE(review): assumes the dictionary column is named 'devid' - confirm.
            devid = rec.get('devid')
        elif devid:
            yield {'id': devid, 'id_type': 'd', 'lat': rec['lat'], 'lon': rec['lon'], 'ts': rec['ts']}


geo_scale = 100000  # approximate number of meters in 1 degree of latitude/longitude
def add_geo(key, recs):
    '''
    Attach the most frequent geo point to every HH member record of one id.

    table_index=0: HH member (id, id_type, member, time_hist, hhid, ip)
    table_index=1: yuid geo (id, id_type, lat, lon, ts)
    table_index=2: devid geo (id, id_type, lat, lon, ts)

    Member records are collected first; each following geo record is counted
    for every member whose time histogram accepts its ts (ts_in_hist). Counts
    are kept per cell, where a cell is the (lat, lon) pair scaled by geo_scale
    and truncated to int. If geo records show up before any member record,
    nothing is emitted for this key.

    Yields one (ip, hhid, member) record per collected member; 'geo_pts' is
    added to the member only when at least one geo point was counted.
    '''
    contexts = []   # (time histogram, member copy, cell counters, hhid, ip)
    for rec in recs:
        if rec['@table_index'] == 0:
            contexts.append((util.TEventHistogram(rec['time_hist']), dict(rec['member']), defaultdict(int),
                             rec['hhid'], rec['ip']))
            continue

        # geo record
        if not contexts:
            return
        for ctx in contexts:
            hist, cells = ctx[0], ctx[2]
            if hist.ts_in_hist(rec['ts']):
                cell = (int(geo_scale*rec['lat']), int(geo_scale*rec['lon']))
                cells[cell] += 1

    for _, member, cells, hhid, ip in contexts:
        if cells:
            # cell with the highest number of hits
            best_lat, best_lon = max(cells, key=cells.get)
            member['geo_pts'] = [[float(best_lat)/geo_scale, float(best_lon)/geo_scale]]
        yield {'ip': ip, 'hhid': hhid, 'member': member}


def collect_daily_group(key, recs, date):
    """
    Gather all members of one (ip, hhid) group into a single record.

    :param key: reduce key with 'hhid' and 'ip'
    :param recs: iterable of records, each carrying a 'member' field
    :param date: value stored in the 'date' field of the output record
    :return: generator yielding exactly one record (hhid, members, ip, date)
    """
    members = []
    for rec in recs:
        members.append(rec['member'])
    group = {'hhid': key['hhid'], 'members': members, 'ip': key['ip'], 'date': date}
    yield group


class HHDailyTask(yt_luigi.BaseYtTask):
    """
    Luigi task that builds the daily household (HH) table for one date.

    Pipeline executed by run():
      1. sort the 'ip_ts_yuid' table by (ip, ts) and reduce it by ip with
         merge_ip_history to get HH members;
      2. map the geo log of the date with parse_geo into yuid and uuid geo tables;
      3. if the 'uuid_dev_info_yt' dictionary exists, convert uuid geo points
         into device id geo points with uuid_to_devid;
      4. join members with geo points by (id, id_type) using add_geo;
      5. group members by (ip, hhid) with collect_daily_group into the
         output table named after the date.
    """
    date = luigi.Parameter()

    def input_folders(self):
        """Return the mapping of logical input names to YT folders."""
        daily_graph_folder = config.YT_OUTPUT_FOLDER + self.date + '/'
        return {
            'daily_graph': daily_graph_folder,
            'geo': config.STATBOX_RTGEO_FOLDER,
            'dict': config.GRAPH_YT_DICTS_FOLDER
        }

    def output_folders(self):
        """Return the mapping of logical output names to YT folders."""
        daily_hh_folder = config.HH_FOLDER2 + 'daily_hh/'
        return {
            'hh_daily': daily_hh_folder,
        }

    def requires(self):
        """Depend on the FP import of the date and on the geo log table of the date."""
        fp_import = ImportFPDayTask(date=self.date, run_date=self.date)
        geo_log = yt_luigi.ExternalInput(self.in_f('geo') + self.date)
        return [fp_import, geo_log]

    def run(self):
        """Run the whole pipeline; all intermediate data lives in temp tables."""
        events_table = self.in_f('daily_graph') + 'ip_ts_yuid'

        mr.mkdir(self.out_f('hh_daily'))
        with yt.TempTable() as history_tbl, \
                yt.TempTable() as members_tbl, \
                yt.TempTable() as members_geo_tbl, \
                yt.TempTable() as geo_yuid_tbl, \
                yt.TempTable() as geo_uuid_tbl, \
                yt.TempTable() as geo_devid_tbl:

            # events per ip, ordered by time -> HH members
            yt.run_sort(events_table, history_tbl, sort_by=['ip', 'ts'])
            ip_reducer = partial(merge_ip_history, holiday=util.is_holiday(self.date))
            yt.run_reduce(ip_reducer,
                          history_tbl,
                          members_tbl,
                          yt_files=['//statbox/statbox-dict-last/geodata4.bin'],
                          reduce_by='ip',
                          sort_by=['ip', 'ts'])

            # geo log -> yuid geo points (output 0) and uuid geo points (output 1)
            geo_log_table = self.in_f('geo') + self.date
            yt.run_map(parse_geo, geo_log_table, [geo_yuid_tbl, geo_uuid_tbl])

            # device geo points are added only when the uuid dictionary is available
            geo_tables = [geo_yuid_tbl]
            uuid_dict_table = self.in_f('dict') + 'uuid_dev_info_yt'
            if yt.exists(uuid_dict_table):
                yt.run_sort(geo_uuid_tbl, sort_by='uuid')
                yt.run_reduce(uuid_to_devid,
                              [uuid_dict_table, geo_uuid_tbl],
                              geo_devid_tbl,
                              reduce_by='uuid')
                geo_tables.append(geo_devid_tbl)

            # members joined with their geo points
            mr.sort_all([members_tbl] + geo_tables, sort_by=['id', 'id_type'])
            yt.run_reduce(add_geo, [members_tbl] + geo_tables, members_geo_tbl,
                          reduce_by=['id', 'id_type'])

            # one output record per (ip, hhid)
            yt.run_sort(members_geo_tbl, sort_by=['ip', 'hhid'])
            result_table = self.out_f('hh_daily') + self.date
            yt.run_reduce(partial(collect_daily_group, date=self.date), members_geo_tbl,
                          result_table, reduce_by=['ip', 'hhid'])

    def output(self):
        """The task is complete when the daily HH table of the date exists."""
        result_table = self.out_f('hh_daily') + self.date
        return [yt_luigi.YtTarget(result_table)]

if __name__ == '__main__':
    import sys

    dt = sys.argv[1]

    yt.config.set_proxy(config.MR_SERVER)
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)

    config.HH_FOLDER2 = '//home/crypta/team/shiryaev/test_hh/'

    task = HHDailyTask(dt)

    print 'hh_daily:', task.out_f('hh_daily')

    task.run()

    print 'Done.'

