#-*- coding: UTF-8 -*-
from common import *
import pandas as pd

def is_succesful_web_request(request):
    """Tell whether a web request counts as successful.

    A request is successful when at least one of the clicks returned by
    ``request.GetClicks()`` has ``DwellTime`` strictly greater than 120
    (presumably seconds -- the unit is not visible here).
    """
    return any(click.DwellTime > 120 for click in request.GetClicks())

def get_serial_season(request, watched_series=None):
    """Build the list of "season_X_episode_Y" labels attached to a request.

    Args:
        request: object exposing ``RelevValues`` (a dict-like); the keys
            'vserial', 'vseason' and 'vepisode' are read from it.
        watched_series: optional mapping ``str(serial id) -> list of labels``
            (the second value returned by get_succesful_video_watches).
            ``None`` or an empty mapping means "nothing watched".

    Returns:
        ``[]`` when the request has no (truthy) 'vserial'. Otherwise a new
        list starting with the label built from 'vseason'/'vepisode'
        (each defaulting to 0), followed by the labels recorded for that
        serial in ``watched_series``.
    """
    relev = request.RelevValues
    serial = relev.get('vserial')
    if not serial:
        return []
    current = "season_{}_episode_{}".format(relev.get('vseason', 0),
                                            relev.get('vepisode', 0))
    # `None` replaces the former mutable `{}` default argument.
    if not watched_series:
        return [current]
    return [current] + watched_series.get(str(serial), [])

def get_web_object_info(request):
    """Return the entity-search object id of a web request.

    The id stored under 'UPPER.EntitySearch.Ontoid' in
    ``request.SearchPropsValues`` is returned only when the
    'UPPER.EntitySearch.Accept' property stringifies to '1'; in every other
    case (or when the id is absent) UNDEFINED_OBJECT_ID is returned.
    """
    props = request.SearchPropsValues
    accepted = str(props.get('UPPER.EntitySearch.Accept')) == '1'
    if not accepted:
        return UNDEFINED_OBJECT_ID
    return props.get('UPPER.EntitySearch.Ontoid', UNDEFINED_OBJECT_ID)

def get_succesful_video_watches(request):
    """Count watched video results of a request and collect watched episodes.

    A main block counts as a watch when its main result is a 'TVideoResult'
    and ``request.FindVideoHeartbeat(result, 'SINGLE')`` returns something
    truthy.

    Returns:
        A ``(count, series)`` tuple: ``count`` is the number of watches and
        ``series`` maps ``str(SerialId)`` to the list of
        "season_X_episode_Y" labels of the watched results that have a
        SerialId (missing season/episode are reported as 0).
    """
    watches = 0
    episodes_by_serial = {}
    for block in request.GetMainBlocks():
        main_result = block.GetMainResult()
        if not main_result.IsA('TVideoResult'):
            continue
        if not request.FindVideoHeartbeat(main_result, 'SINGLE'):
            continue
        watches += 1
        if not main_result.SerialId:
            continue
        label = "season_{}_episode_{}".format(main_result.SerialSeason or 0,
                                              main_result.SerialEpisode or 0)
        episodes_by_serial.setdefault(str(main_result.SerialId), []).append(label)
    return watches, episodes_by_serial

def get_video_object_info(request):
    """Return the video object id of a request.

    Reads 'UPPER.VideoExtraItems.object_id' from
    ``request.SearchPropsValues``; UNDEFINED_OBJECT_ID is returned when the
    property is absent.
    """
    props = request.SearchPropsValues
    object_id = props.get('UPPER.VideoExtraItems.object_id', UNDEFINED_OBJECT_ID)
    return object_id

class calc_web_video_stats(object):
    """Reducer that turns one user's session records into a stats record.

    Each group is parsed with ``libra.ParseSession`` and scanned request by
    request. One Record per group is yielded (only if something was found)
    with the fields:

    * ``uid`` -- ``key.key`` of the group;
    * ``puid`` -- the last truthy ``PassportUID`` seen, else None;
    * ``os_family`` / ``browser`` -- always "unknown" in this reducer;
    * ``web_requests_stats`` -- object id -> {"requests", "timestamps",
      "series"} for successful web requests;
    * ``video_requests_stats`` -- same plus "watches" for video requests
      with at least one watched result;
    * ``vitrine_stats`` -- vitrine object id -> number of personal clicks.
    """

    def __init__(self, config=None):
        # A fresh dict per instance: the former `config = {}` default was a
        # single dict shared by every instance created without arguments.
        self.config = {} if config is None else config

    @staticmethod
    def _accumulate(stats, object_id, request, serial_season, watches=None):
        """Add one request to `stats[object_id]`, creating the entry if needed."""
        entry = stats.get(object_id)
        if entry is None:
            entry = stats[object_id] = {"requests": [], "timestamps": [], "series": []}
            if watches is not None:
                entry["watches"] = 0
        entry["requests"].append(request.Query)
        entry["timestamps"].append(request.Timestamp)
        entry["series"] += serial_season
        if watches is not None:
            entry["watches"] += watches

    @staticmethod
    def _count_personal_vitrine_clicks(request, vitrine_stats):
        """Count personal vitrine clicks of `request`; return True if any was counted."""
        counted = False
        for block in request.GetBSBlocks():
            if not block.Path.startswith("/wizard/vitrina/p/click/p"):
                continue
            is_personal = False
            vitrine_object_id = None
            for var in block.GetVars():
                if var[0] == '-personal' and var[1] == '1':
                    is_personal = True
                if var[0] == '-id':
                    vitrine_object_id = var[1]
            if is_personal and vitrine_object_id:
                counted = True
                vitrine_stats[vitrine_object_id] = vitrine_stats.get(vitrine_object_id, 0) + 1
        return counted

    def __call__(self, groups):
        # Imported here rather than at module level; libra.so is shipped to
        # the job as a remote file by the caller.
        import libra

        for key, recs in groups:
            uid = key.key

            try:
                session = libra.ParseSession(recs, './blockstat.dict')
            except Exception:
                # Best effort: a session libra cannot parse is skipped.
                continue

            puid = None
            web_requests_stats = {}
            video_requests_stats = {}
            vitrine_stats = {}
            need_yield = False

            for r in session:
                if r.PassportUID:
                    need_yield = True
                    puid = r.PassportUID

                if r.IsA("TWebRequestProperties"):
                    if not is_succesful_web_request(r):
                        continue
                    need_yield = True
                    self._accumulate(web_requests_stats,
                                     get_web_object_info(r),
                                     r,
                                     get_serial_season(r))
                elif r.IsA("TVideoRequestProperties"):
                    # Vitrine clicks are counted even when nothing was watched.
                    if self._count_personal_vitrine_clicks(r, vitrine_stats):
                        need_yield = True

                    succesful_watches_count, watched_series = get_succesful_video_watches(r)
                    if succesful_watches_count == 0:
                        continue
                    need_yield = True
                    self._accumulate(video_requests_stats,
                                     get_video_object_info(r),
                                     r,
                                     get_serial_season(r, watched_series),
                                     watches=succesful_watches_count)

            if need_yield:
                yield Record(uid=uid, puid=puid, os_family="unknown", browser="unknown",
                             web_requests_stats=web_requests_stats,
                             video_requests_stats=video_requests_stats,
                             vitrine_stats=vitrine_stats)

def calc_tv_online_stats(groups):
    """Reducer: merge one user's heartbeat aggregates per content id.

    For every group a single Record(uid, tv_online_stats) is yielded, where
    ``tv_online_stats`` maps content_id to a dict. The first record of a
    content id fills all fields; later ones add to "tvt", keep the smallest
    "timestamp" and the largest "watched_time".

    "tvt" is 30 * heartbeat_count (presumably one heartbeat per 30 seconds
    -- confirm against the player); the same value is the fallback for a
    missing "watched_time".
    """
    for key, recs in groups:
        uid = key["yandexuid"]
        per_content = {}
        for rec in recs:
            content_id = rec["content_id"]
            heartbeat_time = 30 * rec["heartbeat_count"]
            watched_time = rec.get("watched_time", heartbeat_time)
            entry = per_content.get(content_id)
            if entry is None:
                per_content[content_id] = {
                    "computed_program" : rec["computed_program"],
                    "computed_channel" : rec["computed_channel"],
                    "tvt" : heartbeat_time,
                    "timestamp" : rec["timestamp"],
                    "source" : rec["source"],
                    "watched_time" : watched_time,
                    "from_block" : rec.get("from_block"),
                    "stream_block" : rec.get("stream_block"),
                    "mute" : rec.get("mute"),
                    "channel_id" : rec.get("channel_id"),
                }
                continue
            entry["tvt"] += heartbeat_time
            entry["timestamp"] = min(entry["timestamp"], rec["timestamp"])
            entry["watched_time"] = max(entry["watched_time"], watched_time)
        yield Record(uid=uid, tv_online_stats=per_content)

def prepare_tv_online_stats_to_join(recs):
    """Mapper: explode per-user tv online stats into one record per content id.

    Every (content_id, data) pair of ``rec["tv_online_stats"]`` becomes
    Record(uid, uuid=content_id, data=data).
    """
    for rec in recs:
        for content_id, data in rec["tv_online_stats"].items():
            yield Record(uid=rec["uid"], uuid=content_id, data=data)

def prepare_tv_online_stats_to_map(groups):
    """Reducer: fold the exploded per-content records back into one per user.

    Each record's "data" is deep-copied and stored under its
    "original_uuid" when that is set, otherwise under its "uuid". A
    "duration" field is added to the copy: ``int(rec["duration"])`` when
    present and truthy, else 0. One Record(uid, tv_online_stats) is yielded
    per group.
    """
    for key, recs in groups:
        merged = {}
        for rec in recs:
            own_uuid = rec["uuid"]
            content_uuid = rec.get("original_uuid") or own_uuid
            entry = deepcopy(rec["data"])
            raw_duration = rec.get("duration")
            entry["duration"] = int(raw_duration) if raw_duration else 0
            merged[content_uuid] = entry
        yield Record(uid=key["uid"], tv_online_stats=merged)

def calc_push_stats(groups):
    """Reducer: count push events per push name for one icookie.

    The push name is the part of "push_id" before the first '.'; the event
    is the record's "normal_path". Yields Record(uid=icookie, push_stats)
    where ``push_stats[push_name][event]`` is the number of such records.
    """
    for key, recs in groups:
        events_by_push = {}
        for rec in recs:
            push_name = rec["push_id"].split('.')[0]
            event = rec["normal_path"]
            counters = events_by_push.setdefault(push_name, {})
            counters[event] = counters.get(event, 0) + 1
        yield Record(uid=key["icookie"], push_stats=events_by_push)

def is_words_in_text(words, text):
    """Return True when every item of `words` is a whole token of `text`.

    `text` is tokenized with ``text.split(' ')`` (single spaces only, so
    consecutive spaces produce empty tokens) and compared by equality.
    An empty `words` yields True.
    """
    # Split once instead of once per word.
    text_words = text.split(' ')
    return all(word in text_words for word in words)

def calc_bar_navig_stats(recs):
    """Mapper: extract "puid", "view" and "visit" records from bar-navig logs.

    For every input record the '&'-separated ``http_params`` are scanned:

    * ``ver=`` decides the layout of ``decoded_vc`` (see below);
    * ``decoded_vc=`` (double URL-quoted JSON) yields one
      Record(type="view") per played video longer than 30;
    * ``url=`` and ``title=`` together yield one Record(type="visit").

    Independently, a JSON ``decoded_bundle`` carrying ``params.uid_portal``
    yields one Record(type="puid").

    Malformed input is skipped rather than raised (best effort); records
    already yielded before a parse error are kept.
    """
    for rec in recs:
        yandexuid = rec["yandexuid"]
        timestamp = rec["_logfeller_timestamp"]
        params = rec.get("http_params", "").split('&')

        # NOTE(review): this is a per-component test, not a lexicographic
        # version comparison (e.g. 73.0.100 is treated as "old"). Kept as is;
        # confirm whether (major, minor, build) >= (72, 0, 3626) was intended.
        is_new_version = True
        for param in params:
            if not param.startswith("ver="):
                continue
            try:
                vals = param[4:].split('.')
                if int(vals[0]) < 72 or int(vals[1]) < 0 or int(vals[2]) < 3626:
                    is_new_version = False
            except (ValueError, IndexError):
                # Non-numeric or too short version: keep the current flag.
                continue

        if rec.get("decoded_bundle"):
            puid = None
            # `except Exception` (not a bare except) so that GeneratorExit
            # raised at the yield is not swallowed when the generator is closed.
            try:
                for elem in json.loads(rec["decoded_bundle"]):
                    if elem.get("params") and "uid_portal" in elem["params"]:
                        puid = elem["params"]["uid_portal"]
                if puid:
                    yield Record(yandexuid=yandexuid,
                                 timestamp=timestamp,
                                 puid=puid,
                                 type="puid")
            except Exception:
                pass

        title = None
        url = None
        for param in params:
            if param.startswith("decoded_vc="):
                try:
                    vc = json.loads(urllib.unquote(urllib.unquote(param[11:])))
                    if is_new_version:
                        # New layout: list of {"url": ..., "data": [stat dicts]}.
                        for stats in vc:
                            page_url = stats["url"]
                            for stat in stats["data"]:
                                if float(stat["duration"]) <= 30:
                                    continue
                                if float(stat["played_duration"]) < 1:
                                    continue
                                # Fix: frame_url used to be read before it was
                                # assigned in this branch; the resulting
                                # NameError was swallowed and the view dropped.
                                frame_url = stat["frame_url"]
                                # NOTE(review): under Python 2 a JSON parser may
                                # return `unicode`, which this check rejects --
                                # same check as the old-layout branch; confirm.
                                if not isinstance(frame_url, str):
                                    continue
                                yield Record(yandexuid=yandexuid,
                                             timestamp=timestamp,
                                             page_url=page_url,
                                             frame_url=frame_url,
                                             view_time=stat["played_duration"],
                                             duration=stat["duration"],
                                             type="view")
                    else:
                        # Old layout: list of {page_url: {"p": [positional stats]}}.
                        for elem in vc:
                            for page_url in elem:
                                for stat in elem[page_url]["p"]:
                                    # Positions shift by one when the first
                                    # item has length <= 1.
                                    corr = 0 if len(stat[0]) > 1 else -1
                                    view_time = float(stat[5 + corr])
                                    duration = float(stat[4 + corr])
                                    if duration <= 30:
                                        continue
                                    frame_url = stat[11 + corr]
                                    if not isinstance(frame_url, str):
                                        continue
                                    yield Record(yandexuid=yandexuid,
                                                 timestamp=timestamp,
                                                 page_url=page_url,
                                                 frame_url=frame_url,
                                                 view_time=view_time,
                                                 duration=duration,
                                                 type="view")
                except Exception:
                    continue
            if param.startswith("url="):
                url = urllib.unquote(urllib.unquote(param[4:]))
            if param.startswith("title="):
                title = urllib.unquote(urllib.unquote(param[6:])).replace('+', ' ')
        if url is not None and title is not None:
            yield Record(yandexuid=yandexuid,
                         timestamp=timestamp,
                         url=url,
                         title=title,
                         type="visit")

def reduce_bar_navig_stats(groups):
    """Reducer: aggregate one user's "view" and "visit" records.

    * "view" records add ``view_time`` to the frame url's total, which is
      then capped by that record's ``duration``; frame urls longer than
      100 characters are ignored.
    * "visit" records are counted per url (and per title); a visit whose
      url or title is longer than 100 characters is ignored.

    Users with more than 10000 distinct urls in either counter are dropped.
    Otherwise, if anything was collected, yields
    Record(uid, view_time_by_url, visits_by_url) -- the per-title counter
    is not part of the output.
    """
    MAX_URL_LEN = 100
    MAX_TITLE_LEN = 100
    MAX_URL_COUNT = 10000
    for key, recs in groups:
        view_time_by_url = Counter()
        visits_by_url = Counter()
        visits_by_title = Counter()
        for rec in recs:
            rec_type = rec["type"]
            if rec_type == "view":
                frame_url = rec["frame_url"]
                if len(frame_url) <= MAX_URL_LEN:
                    total = view_time_by_url[frame_url] + rec["view_time"]
                    view_time_by_url[frame_url] = min(total, rec["duration"])
            elif rec_type == "visit":
                url = rec["url"]
                if len(url) <= MAX_URL_LEN and len(rec["title"]) <= MAX_TITLE_LEN:
                    visits_by_url[url] += 1
                    visits_by_title[rec["title"]] += 1
        too_many_urls = (len(visits_by_url) > MAX_URL_COUNT
                         or len(view_time_by_url) > MAX_URL_COUNT)
        if too_many_urls:
            continue
        if view_time_by_url or visits_by_url or visits_by_title:
            yield Record(uid=key["yandexuid"],
                         view_time_by_url=view_time_by_url,
                         visits_by_url=visits_by_url)

def get_stats_tables(cluster, stats_prefix, date, days):
    """Collect the existing daily stats tables for the last `days` days.

    For each offset 0..days-1 the path ``stats_prefix + "YYYY-MM-DD"`` of
    ``date - offset days`` is checked with ``cluster.driver.exists``;
    existing paths are opened on a fresh job. The job is run before
    returning.

    Returns:
        List of job tables, newest date first.
    """
    job = cluster.job()
    found_tables = []
    for days_back in range(days):
        day = date - timedelta(days_back)
        table_path = stats_prefix + dt.strftime(day, "%Y-%m-%d")
        if not cluster.driver.exists(table_path):
            continue
        found_tables.append(job.table(table_path))

    job.run()
    return found_tables

def _wait_for_tables(cluster, paths, message):
    """Block until every path in `paths` exists on `cluster`.

    Prints `message` and sleeps 100 seconds between polls; there is no timeout.
    """
    while not all(cluster.driver.exists(path) for path in paths):
        print(message)
        time.sleep(100)

def calc_stats_by_date(server, date_str, mr_account,
                       process_user_sessions,
                       process_tv_online,
                       process_pushes_stats,
                       process_bar_navig,
                       calc_uid_install_id_table,
                       calc_aggregated_tables,
                       days_to_aggregate,
                       need_salt):
    """Build the daily stats tables for `date_str` and, optionally, the aggregate.

    Args:
        server: "arnold" or "hahn"; anything else raises Exception.
        date_str: day to process, "YYYY-MM-DD".
        mr_account: accepted for interface compatibility; not used here.
        process_user_sessions: truthy -> build search/video stats from
            user sessions.
        process_tv_online: truthy -> build tv online stats from the redir log.
        process_pushes_stats: truthy -> build push stats from the redir log.
        process_bar_navig: truthy -> build bar-navig stats.
        calc_uid_install_id_table: truthy -> rebuild UID_INSTALL_ID_TABLE.
        calc_aggregated_tables: truthy -> aggregate the last
            `days_to_aggregate` daily tables and hand the result to
            prepare_uids_to_push.
        days_to_aggregate: number of days (ending at `date_str`) to aggregate.
        need_salt: passed through to prepare_uids_to_push.

    Each enabled daily step waits (without timeout) for its input tables,
    writes to "<prefix><date>_pre_final" and, after the job has run, the
    result replaces "<prefix><date>".
    """
    tmp_root = "//tmp"
    cluster_factories = {"arnold": clusters.yt.Arnold, "hahn": clusters.yt.Hahn}
    if server not in cluster_factories:
        raise Exception("Unknown cluster")
    # Both clusters share the same environment settings.
    cluster = cluster_factories[server]().env(
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            use_default_tentative_pool_trees=True,
            max_failed_job_count=10,
            job_io={"table_writer" : {"max_row_weight" : 64 * 1024 * 1024}}
        ),
        templates=dict(
            tmp_root=tmp_root,
            title='CalcStatsForPushes'
        ))

    # Input tables for web/video (user sessions) stats.
    us_table = USER_SESSIONS_PREFIX + date_str + SESSIONS_CLEAN_SUFFIX
    us_staff_table = USER_SESSIONS_PREFIX + date_str + SESSIONS_STAFF_SUFFIX

    job = cluster.job()

    # calc user_sessions stats
    if process_user_sessions:
        _wait_for_tables(cluster, [us_table, us_staff_table], "No user_sessions tables")
        reqs = job.concat(job.table(us_table), job.table(us_staff_table)) \
                  .groupby('key') \
                  .sort('subkey') \
                  .reduce(calc_web_video_stats(),
                          files=[nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
                                 nile.files.RemoteFile('statbox/resources/libra.so')],
                          memory_limit=4000) \
                  .sort('uid') \
                  .put(SEARCH_STATS_PREFIX + date_str + "_pre_final")

    # calc tv online stats
    if process_tv_online:
        redir_table = REDIR_LOG_PREFIX + '/' + date_str
        _wait_for_tables(cluster, [redir_table], "No redir tables")
        redir = job.table(redir_table)
        data = job.table(STRM_META_PATH)
        t = redir.filter(sf.contains('value', 'path=player-events.heartbeat')) \
                 .qb2(log='redir-log',
                      fields=['path', 'yandexuid', 'timestamp',
                              se.log_field('content_id'),
                              se.log_field('channel_id'),
                              se.log_field('source'),
                              se.log_field('watchedTime'),
                              se.log_field('from_block'),
                              se.log_field('stream_block'),
                              se.log_field('mute'),
                              se.custom('is_hb', lambda x: 1 if 'heartbeat' in str(x) else 0, 'path')
                              ],
                      filters=[sf.defined('path', 'content_id', 'yandexuid', 'timestamp'),
                               sf.contains('path', 'player-events.heartbeat')
                               ],
                      mode='yamr_lines', intensity='data'
                      ).groupby('content_id', 'yandexuid', 'source') \
                 .aggregate(heartbeat_count=na.sum('is_hb'),
                            timestamp=na.min('timestamp'),
                            watched_time=na.max('watchedTime'),
                            from_block=na.any('from_block'),
                            stream_block=na.any('stream_block'),
                            mute=na.any('mute'),
                            channel_id=na.any('channel_id')) \
                 .join(data, by_left='content_id', by_right='JoinKey', type='left')
        # Rows without stream metadata get '-' placeholders for channel/program.
        j1 = t.filter(sf.defined('computed_channel'))
        j2 = t.filter(sf.not_(sf.defined('computed_channel'))) \
              .project(ne.all(), computed_channel=ne.const('-'), computed_program=ne.const('-'))
        events = job.concat(j1, j2)
        events.groupby('yandexuid') \
              .reduce(calc_tv_online_stats) \
              .sort('uid') \
              .put(TV_ONLINE_STATS_PREFIX + date_str + "_pre_final")

    if process_pushes_stats:
        redir_table = REDIR_LOG_PREFIX + '/' + date_str
        _wait_for_tables(cluster, [redir_table], "No redir tables")
        job.table(redir_table).filter(sf.contains('value', 'push_id=tv_online')) \
            .qb2(
            log = 'redir-log',
            fields = [
                'yandexuid', 'date', 'path', 'normal_path', 'parsed_log_line', 'normal_vars', 'timestamp',
                se.log_field('icookie'),
                'pid', 'cid',
                se.custom('push_id', lambda x: x.get('push_id') if x else '-', 'normal_vars')
            ],
            filters = [
                sf.or_(sf.equals('cid', '73559'), sf.equals('cid', 73559)),
                sf.or_(sf.equals('pid', '457'), sf.equals('pid', 457)),
                sf.contains('push_id', 'tv_online')
            ],
            mode = 'yamr_lines',
        ).groupby('icookie').reduce(calc_push_stats).put(PUSHES_STATS_PREFIX + date_str + "_pre_final")

    if process_bar_navig:
        bar_navig_table = BAR_NAVIG_PREFIX + '/' + date_str
        _wait_for_tables(cluster, [bar_navig_table], "No bar navig log tables")
        job.table(bar_navig_table).filter(sf.not_(sf.equals('yandexuid', '-'))) \
           .map(calc_bar_navig_stats, memory_limit=16000) \
           .sort('yandexuid', 'timestamp') \
           .groupby('yandexuid') \
           .reduce(reduce_bar_navig_stats, memory_limit=16000) \
           .sort('uid') \
           .put(BAR_NAVIG_STATS_PREFIX + date_str + "_pre_final")

    if calc_uid_install_id_table:
        yuid_info_table = job.table(CRYPTA_YUID_INFO_TABLE)
        mobile_uuids = yuid_info_table.project(uid='id').join(job.table(CRYPTA_YUID_UUID_TABLE), by_left='uid', by_right='id') \
                                      .project('uid', install_id='target_id') \
                                      .join(job.table(APP_METRICA_ACTIVE_UUID_TABLE), by_left='install_id', by_right='id') \
                                      .filter(sf.custom(lambda x : x in PP_APP_IDS or x.startswith('com.yandex.browser'), 'app_id')) \
                                      .project('uid', 'install_id')

        desktop_yuids = yuid_info_table.project(uid='id', install_id='id')
        job.concat(desktop_yuids, mobile_uuids) \
           .sort('uid', 'install_id') \
           .put(UID_INSTALL_ID_TABLE)
    job.run()

    # Promote every "_pre_final" table produced above to its final name.
    for prefix, need_process in zip([SEARCH_STATS_PREFIX, TV_ONLINE_STATS_PREFIX, PUSHES_STATS_PREFIX, BAR_NAVIG_STATS_PREFIX],
                                    [process_user_sessions, process_tv_online, process_pushes_stats, process_bar_navig]):
        if need_process:
            if cluster.driver.exists(prefix + date_str):
                cluster.driver.remove(prefix + date_str)
            cluster.driver.copy(prefix + date_str + "_pre_final", prefix + date_str)
            cluster.driver.remove(prefix + date_str + "_pre_final")

    if calc_aggregated_tables:

        stat_fields = ['web_requests_stats', 'video_requests_stats', 'vitrine_stats',
                       'tv_online_stats', 'push_stats', 'urls', 'views']
        aggregated_pre_final = tmp_root + "/msvvitaly/aggregated_pre_final"

        date = dt.strptime(date_str, "%Y-%m-%d")

        search_stats_to_concat = get_stats_tables(cluster, SEARCH_STATS_PREFIX, date, days_to_aggregate)
        tv_online_stats_to_concat = get_stats_tables(cluster, TV_ONLINE_STATS_PREFIX, date, days_to_aggregate)
        pushes_stats_to_concat = get_stats_tables(cluster, PUSHES_STATS_PREFIX, date, days_to_aggregate)
        bar_navig_stats_to_concat = get_stats_tables(cluster, BAR_NAVIG_STATS_PREFIX, date, days_to_aggregate)

        job = cluster.job()

        search_stats_aggregated = job.concat(*search_stats_to_concat) \
                                     .groupby('uid') \
                                     .reduce(aggregate_search_stats, files=nfi_common)

        tv_online_stats_agggregated_without_original_uuid = job.concat(*tv_online_stats_to_concat) \
                                         .groupby('uid') \
                                         .reduce(aggregate_tv_online_stats, files=nfi_common)

        uuid_to_original_uuid = job.table(CONTENT_RESOURCE) \
                               .filter(sf.equals('ResourceName', 'original_yatv_uuid_src')) \
                               .join(job.table(CONTENT_GROUP), by='ContentGroupID') \
                               .project(uuid='UUID', original_uuid='Value')

        # Re-key tv online stats by the original uuid (when known) and attach
        # the stream metadata joined from STRM_META_PATH.
        tv_online_stats_agggregated = tv_online_stats_agggregated_without_original_uuid.map(prepare_tv_online_stats_to_join) \
                                                                        .join(uuid_to_original_uuid, by='uuid', type='left') \
                                                                        .join(job.table(STRM_META_PATH), by_left='uuid', by_right='JoinKey', type='left') \
                                                                        .groupby('uid') \
                                                                        .reduce(prepare_tv_online_stats_to_map)

        pushes_stats_aggregated = job.concat(*pushes_stats_to_concat) \
                                     .groupby('uid') \
                                     .reduce(aggregate_pushes_stats, files=nfi_common)

        bar_navig_stats_aggregated = job.concat(*bar_navig_stats_to_concat) \
                                        .groupby('uid') \
                                        .reduce(aggregate_bar_navig_stats, files=nfi_common)

        stats = job.concat(search_stats_aggregated,
                           tv_online_stats_agggregated,
                           pushes_stats_aggregated,
                           bar_navig_stats_aggregated) \
                   .groupby('uid') \
                   .aggregate(**{x: na.any(x) for x in stat_fields})

        stats_with_puid_joined = stats.join(job.table(CRYPTA_UID_PUID_TABLE), by_left='uid', by_right='id', type='left')

        stats_with_puid = stats_with_puid_joined.filter(sf.defined('target_id')) \
                                                  .groupby('target_id') \
                                                  .reduce(aggregate_stats_puid, files=nfi_common) \
                                                  .groupby('uid') \
                                                  .reduce(get_best_uid_puid_map(stat_fields), files=nfi_common)

        # Rows without a puid keep their uid; missing stat fields become {}.
        stats_without_puid = stats_with_puid_joined.filter(sf.not_(sf.defined('target_id'))) \
                                                   .project(ne.all(exclude=stat_fields),
                                                            **{x : ne.custom(lambda y : y if y else {}, x) for x in stat_fields})

        job.concat(stats_without_puid, stats_with_puid) \
           .sort('uid') \
           .put(aggregated_pre_final)

        job.run()

        prepare_uids_to_push(cluster,
                             aggregated_pre_final,
                             STATS_PREFIX + AGGREGATED_STATS_SUFFIX,
                             stat_fields,
                             need_salt, SPLIT_PATH)

def main():
    """Parse command-line options and run calc_stats_by_date for each day.

    Every day of ``pd.date_range(--start_date, --end_date)`` is processed in
    order; the integer flags are passed through unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--server', type=str, default='arnold')
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    parser.add_argument('--mr_account', type=str, default='videoindex')
    int_options = (
        ('--process_user_sessions', 0),
        ('--process_tv_online', 0),
        ('--process_pushes_stats', 0),
        ('--process_bar_navig', 0),
        ('--calc_uid_install_id_table', 0),
        ('--calc_aggregated_tables', 0),
        ('--days_to_aggregate', 180),
        ('--need_salt', 1),
    )
    for option, default in int_options:
        parser.add_argument(option, type=int, default=default)
    args = parser.parse_args()

    for day in pd.date_range(args.start_date, args.end_date):
        # str(Timestamp) starts with "YYYY-MM-DD".
        calc_stats_by_date(server=args.server,
                           date_str=str(day)[:10],
                           mr_account=args.mr_account,
                           process_user_sessions=args.process_user_sessions,
                           process_tv_online=args.process_tv_online,
                           process_pushes_stats=args.process_pushes_stats,
                           process_bar_navig=args.process_bar_navig,
                           calc_uid_install_id_table=args.calc_uid_install_id_table,
                           calc_aggregated_tables=args.calc_aggregated_tables,
                           days_to_aggregate=args.days_to_aggregate,
                           need_salt=args.need_salt)

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
