#-*- coding: UTF-8 -*-
from common import *
import pandas as pd

def get_timestamp(ts):
    """Scale a raw log timestamp down by a factor of one million.

    Presumably this converts microseconds to seconds (TODO confirm the unit
    of the incoming log fields). The plain ``/`` operator is used, so the
    result type follows the interpreter's division rules for ``ts``.
    """
    divisor = 1000000
    return ts / divisor

# Hosts for which map_cmnt_reactions sets is_porno=True on an exact match.
PORNO_HOSTS = {
    "www.xvideos.com",
    "topporno.tv",
    "hotfiesta.com",
    "spree.link",
    "hotfiesta.tv",
    "semyana.pw",
    "www.pornhub.com",
    "porno-tour.xxx",
    "m.ru-xvideos.tv",
    "m.porn0sex.me",
    "100xyev.com",
    "720video.tv",
    "semyana.video",
    "www.ebalka.me",
    "www.24xxx.me",
    "www.ebalovo.info",
    "www.xnxx.com",
}

# Hosts for which map_cmnt_reactions sets is_porno=False; consulted only
# after the exact-match and substring checks above did not fire.
GOOD_HOSTS = {
    "www.youtube.com",
    "ok.ru",
    "video.mail.ru",
    "frontend.vh.yandex.ru",
    "russia.tv",
    "www.1tv.ru",
    "argumentiru.com",
    "www.kinopoisk.ru",
    "www.newstube.ru",
    "www.ntv.ru",
    "www.vesti.ru",
    "vimeo.com",
}

def update_uuid_info(groups):
    """Reducer: collapse all rows sharing one uuid into a single info row.

    Rows carrying a truthy ``NewInitEntityUrl`` are treated as fresh data and
    always win for the entity URL/name. Rows without it supply
    ``InitEntityUrl``/``InitEntityName`` only while no fresh row has been
    seen yet. ``user_actions`` is summed over every row of the group.

    Args:
        groups: iterable of ``(key, recs)`` pairs, where ``key["uuid"]`` is
            the grouping key and ``recs`` iterates the rows of that group.

    Yields:
        One ``Record`` per group with ``uuid``, ``InitEntityUrl``,
        ``InitEntityName``, ``user_actions`` and, when known, ``is_porno``.
    """
    for key, recs in groups:
        entity_url = None
        entity_name = None
        is_porno = None
        has_new_info = False
        user_actions = 0
        for rec in recs:
            is_fresh = bool(rec.get("NewInitEntityUrl"))
            if is_fresh:
                entity_url = rec["NewInitEntityUrl"]
                entity_name = rec["NewInitEntityName"]
                has_new_info = True
            elif not has_new_info:
                entity_url = rec["InitEntityUrl"]
                entity_name = rec["InitEntityName"]

            # Row order inside a group is not fixed by the pipeline, so a row
            # arriving after the fresh one may still fill in a missing flag;
            # otherwise a known is_porno would be lost depending on the order.
            if "is_porno" in rec and (is_fresh or not has_new_info or is_porno is None):
                is_porno = rec["is_porno"]

            # Tolerate an explicit None counter as well as an absent one.
            user_actions += rec.get("user_actions") or 0

        common_fields = {"uuid": key["uuid"]}
        if is_porno is not None:
            common_fields["is_porno"] = is_porno
        yield Record(InitEntityUrl=entity_url,
                     InitEntityName=entity_name,
                     user_actions=user_actions,
                     **common_fields)

def map_cmnt_reactions(recs):
    """Mapper: turn comment-service log rows into per-user squeeze rows.

    For every input row the mapper builds the fields shared by all output
    rows (timestamp, service, acting user, uuid, optional chat/entity info
    and ``is_porno``) and then yields one ``Record`` per affected user: the
    acting user and, for reactions/replies/targeted deletions, the user the
    action was addressed to.

    ``is_porno`` is copied from the row when present; otherwise it is guessed
    from the host of the URL embedded in ``InitEntityUrl``. The guess is
    best-effort: any failure to parse the URL leaves the field unset.

    Args:
        recs: iterable of joined comment-log rows.

    Yields:
        ``Record`` objects with ``puid``, ``action_type``, ``comment_id`` and
        the shared fields.
    """
    import urlparse

    for rec in recs:
        common_fields = {'ts': str(get_timestamp(rec['event_timestamp'])),
                         'servicetype': rec['ServiceSlug'],
                         'action_owner_puid': str(rec['Puid']),
                         'uuid': rec['EntityId'][0]}

        if 'ChatId' in rec:
            common_fields['chat_id'] = rec['ChatId']
        if 'InitEntityUrl' in rec:
            common_fields['entity_url'] = rec['InitEntityUrl'][0]
        if 'InitEntityName' in rec:
            common_fields['entity_name'] = rec['InitEntityName'][0]
        if "is_porno" in rec:
            common_fields["is_porno"] = rec["is_porno"]
        elif 'InitEntityUrl' in rec:
            try:
                parsed = urlparse.urlparse(rec["InitEntityUrl"][0])
                # The target URL sits in the `text` query parameter behind a
                # 4-character prefix (presumably a "url:" marker - TODO confirm).
                url = urlparse.parse_qs(parsed.query)['text'][0][4:]
                host = urlparse.urlparse(url).netloc
            except Exception:
                # Best-effort classification: an unparsable URL simply leaves
                # is_porno unset instead of failing the whole mapper.
                host = None
            if host is not None:
                if host in PORNO_HOSTS:
                    common_fields["is_porno"] = True
                elif "sex" in host or "porn" in host or "xxx" in host:
                    common_fields["is_porno"] = True
                elif host in GOOD_HOSTS:
                    common_fields["is_porno"] = False

        # A missing, None or empty Ids list falls back to a placeholder id.
        ids = rec.get('Ids')
        if ids:
            action_owner_comment_id = str(ids[0])
        else:
            action_owner_comment_id = 'fake_comment_id'
        common_fields['action_owner_comment_id'] = action_owner_comment_id

        user_action = rec["UserAction"]

        # comment_ids is zipped with puids below: position 0 belongs to the
        # acting user, position 1 to the addressed user.
        comment_ids = [action_owner_comment_id]
        if user_action in ["UA_REACTION", "UA_DELETE_REACTION"]:
            comment_ids = ['fake_comment_id', action_owner_comment_id]
        elif user_action == "UA_REPLY" or (user_action == "UA_DELETE_POST" and rec['PuidTo'] != 0):
            if rec.get('ReplyTo'):
                comment_ids.append(str(rec['ReplyTo']))
            else:
                comment_ids.append('fake_comment_id')

        puids = [str(rec['Puid'])]
        if user_action in ["UA_REACTION", "UA_REPLY"]:
            puids = [str(rec['Puid']), str(rec['PuidTo'])]
        elif user_action in ["UA_DELETE_REACTION", "UA_DELETE_POST"] and rec['PuidTo'] != 0:
            puids = [str(rec['Puid']), str(rec['PuidTo'])]

        action_type = 'commentor other'
        if user_action in ["UA_POST", "UA_REPLY"]:
            action_type = 'comment'
        elif user_action == "UA_REACTION":
            action_type = 'like comment'
            common_fields['reaction_type'] = str(rec['ReactionType'])
        elif user_action == "UA_DELETE_POST":
            action_type = 'delete comment'
        elif user_action == "UA_DELETE_REACTION":
            action_type = 'delete reaction'

        # zip() truncates to the shorter list, so an untargeted action yields
        # a single row for the acting user only.
        for puid, comment_id in zip(puids, comment_ids):
            yield Record(puid=puid,
                         action_type=action_type,
                         comment_id=comment_id,
                         **common_fields)

def process_cmnt_logs_common(cluster, tables, path, tmp_path):
    """Build the comment squeeze for the given log tables.

    Runs two jobs in sequence:

    1. Refresh ``UUID_INFO_TABLE``: per-uuid entity url/name and the number
       of tracked user actions are computed from ``tables``, merged with the
       existing table (if it exists) through ``update_uuid_info`` and written
       back sorted by ``uuid``.
    2. Build the squeeze: successful tracked actions are left-joined with
       ``UUID_INFO_TABLE``, mapped by ``map_cmnt_reactions``, sorted and
       written to ``tmp_path``.

    Finally ``path`` is replaced by a copy of ``tmp_path`` and ``tmp_path``
    is removed.

    Args:
        cluster: cluster object providing ``job()`` and ``driver``.
        tables: paths of the comment access-log tables to read.
        path: destination path of the squeeze table.
        tmp_path: scratch path the squeeze is written to first.
    """
    service_slugs = ['ether', 'video']
    tracked_actions = ['UA_POST', 'UA_DELETE_POST', 'UA_REPLY', 'UA_REACTION', 'UA_DELETE_REACTION', 'UA_COMPLAIN']

    # --- Job 1: refresh the per-uuid info table -------------------------
    job = cluster.job()
    source_logs = [job.table(name) for name in tables]

    entity_rows = job.concat(*source_logs).filter(
        sf.custom(lambda slug: slug in service_slugs, 'ServiceSlug'),
        sf.custom(lambda ids: ids and ids[0], 'EntityId'),
        sf.defined('InitEntityName'),
        sf.defined('InitEntityUrl'))
    entity_rows = entity_rows.project(
        'InitEntityUrl', 'InitEntityName',
        uuid=ne.custom(lambda ids: ids[0], 'EntityId'))
    new_uuid_info = entity_rows.groupby('uuid').aggregate(
        NewInitEntityUrl=na.any('InitEntityUrl'),
        NewInitEntityName=na.any('InitEntityName'))

    action_rows = job.concat(*source_logs).filter(
        sf.custom(lambda slug: slug in service_slugs, 'ServiceSlug'),
        sf.custom(lambda ids: ids and ids[0], 'EntityId'),
        sf.custom(lambda action: action in tracked_actions, 'UserAction'))
    action_rows = action_rows.project(uuid=ne.custom(lambda ids: ids[0], 'EntityId'))
    new_uuid_user_actions = action_rows.groupby('uuid').aggregate(user_actions=na.count())

    uuid_streams = [new_uuid_info.join(new_uuid_user_actions, by='uuid', type='left')]
    if cluster.driver.exists(UUID_INFO_TABLE):
        # Previously stored rows take part in the merge done by update_uuid_info.
        uuid_streams.append(job.table(UUID_INFO_TABLE))

    merged_info = job.concat(*uuid_streams).groupby('uuid').reduce(update_uuid_info)
    merged_info.sort('uuid').put(UUID_INFO_TABLE)

    job.run()

    # --- Job 2: build the squeeze from successful tracked actions -------
    job = cluster.job()
    source_logs = [job.table(name) for name in tables]

    actions = job.concat(*source_logs).filter(
        sf.equals('Success', True),
        sf.custom(lambda slug: slug in service_slugs, 'ServiceSlug'),
        sf.custom(lambda action: action in tracked_actions, 'UserAction'))
    # Entity url/name are dropped here and taken from the info table instead.
    actions = actions.project(
        ne.all(exclude=('InitEntityUrl', 'InitEntityName')),
        uuid=ne.custom(lambda ids: ids[0], 'EntityId'))
    enriched = actions.join(job.table(UUID_INFO_TABLE), by='uuid', type='left')
    squeeze = enriched.map(map_cmnt_reactions).sort(*SQUEEZE_SORT_FIELDS)
    squeeze.put(tmp_path, schema=SQUEEZE_SCHEMA)

    job.run()

    # --- Swap the freshly built table into place -------------------------
    driver = cluster.driver
    if driver.exists(path):
        driver.remove(path)
    driver.copy(tmp_path, path)
    driver.remove(tmp_path)

def process_cmnt_logs_fast(cluster, date_list):
    """Build the fast comment squeeze from a batch of fast-log tables.

    The output (and scratch) table is named after the last element of
    ``date_list``; every element contributes one input table.

    Args:
        cluster: cluster object forwarded to ``process_cmnt_logs_common``.
        date_list: names of the fast-log tables to process. When it is empty
            there is nothing to process and the function returns without
            touching the cluster.
    """
    if not date_list:
        # No new tables: date_list[-1] would raise IndexError otherwise.
        return
    last_date = date_list[-1]
    path = FAST_SQUEEZE_PREFIX + last_date
    tmp_path = FAST_TMP_SQUEEZE_PREFIX + last_date
    tables = [CMNT_ACCESS_LOG_PREFIX + FAST_LOG_PREFIX + date for date in date_list]
    process_cmnt_logs_common(cluster, tables, path, tmp_path)

def process_cmnt_logs_daily(cluster, date):
    """Build the daily comment squeeze for ``date``.

    Reads the single daily comment access-log table of that date and
    delegates to ``process_cmnt_logs_common`` with the daily output and
    scratch paths.
    """
    process_cmnt_logs_common(
        cluster,
        [CMNT_ACCESS_LOG_PREFIX + DAILY_LOG_PREFIX + date],
        SOCIALITY_SQUEEZE_PREFIX + CMNT_SQUEEZE_PREFIX + date,
        SOCIALITY_TMP_SQUEEZE_PREFIX + CMNT_SQUEEZE_PREFIX + date)

def get_reaction_type(value):
    """Return the name of the first truthy reaction flag in ``value``.

    The flags are checked in a fixed priority order; when none of them is
    truthy the result is ``"other"``.

    Args:
        value: mapping exposing ``get`` with the reaction flags as keys.
    """
    # (flag in the payload, reported reaction name), highest priority first.
    priority = (
        ("Like", "like"),
        ("Angry", "angry"),
        ("Dislike", "dislike"),
        ("Skip", "skip"),
        ("Wow", "wow"),
        ("Fire", "fire"),
        ("HaHa", "haha"),
        ("Bored", "bored"),
        ("Sad", "sad"),
        ("Heart", "heart"),
    )
    for flag, reaction in priority:
        if value.get(flag):
            return reaction
    return "other"

def map_reaction_log(recs):
    """Mapper: convert reaction-log rows into squeeze rows.

    A row is used only when its ``Key`` splits on ``/`` into exactly five
    parts, the fourth part names a supported event kind and the JSON payload
    carries a truthy ``CreateTime`` or ``Time``; everything else is skipped.

    Args:
        recs: iterable of rows with ``Key``, ``uid_prefix`` and ``ValueJson``.

    Yields:
        ``Record`` objects describing a video reaction or a subscription.
    """
    import json

    video_reaction_kinds = ["liked-video", "liked-vh-skip"]

    for rec in recs:
        parts = rec["Key"].split('/')
        if len(parts) != 5:
            continue
        # The third key part is the user id; the prefix tells the id kinds apart.
        puid = rec["uid_prefix"] + parts[2]
        kind = parts[3]

        value = json.loads(rec["ValueJson"])
        if kind in video_reaction_kinds:
            reaction_type = get_reaction_type(value)
            action_type = 'reaction on video'
        elif kind == "liked-vh-subs":
            reaction_type = action_type = 'subscription'
        else:
            continue

        # CreateTime is preferred; Time is the fallback.
        moment = value.get("CreateTime") or value.get("Time")
        if not moment:
            continue

        yield Record(ts=str(moment["seconds"]),
                     servicetype='ether',
                     puid=puid,
                     action_owner_puid=puid,
                     uuid=value['Id'],
                     action_type=action_type,
                     reaction_type=reaction_type)

def process_reaction_logs(cluster, date):
    """Build the reaction squeeze for ``date`` and report whether it looks complete.

    Rows from both reaction logs whose ``Key`` mentions a supported event
    kind are tagged with a uid prefix (``'y'`` for the yuid log, empty for
    the puid log), mapped by ``map_reaction_log`` and kept only when their
    ``ts`` falls into the 24 hours starting at local midnight of ``date``.
    The result is written to a scratch table which then replaces the
    destination table.

    Args:
        cluster: cluster object providing ``job()`` and ``driver``.
        date: day to process, formatted as ``%Y-%m-%d``.

    Returns:
        True when the resulting table has more than 5000 rows.
    """
    path = SOCIALITY_SQUEEZE_PREFIX + REACTION_SQUEEZE_PREFIX + date
    tmp_path = SOCIALITY_TMP_SQUEEZE_PREFIX + REACTION_SQUEEZE_PREFIX + date

    start_ts = int(time.mktime(dt.strptime(date, "%Y-%m-%d").timetuple()))
    end_ts = start_ts + 24 * 60 * 60

    job = cluster.job()

    def liked_rows(table_path, uid_prefix):
        # Same key filter for both logs; only the uid prefix differs.
        relevant = job.table(table_path).filter(
            sf.or_(sf.contains('Key', 'liked-video'),
                   sf.contains('Key', 'liked-vh-skip'),
                   sf.contains('Key', 'liked-vh-subs')))
        return relevant.project(ne.all(), uid_prefix=ne.const(uid_prefix))

    yuids_reactions = liked_rows(REACTION_YUIDS_LOG, 'y')
    puids_reactions = liked_rows(REACTION_PUIDS_LOG, '')

    squeeze = job.concat(yuids_reactions, puids_reactions).map(map_reaction_log)
    squeeze = squeeze.filter(sf.custom(lambda ts: start_ts <= int(ts) < end_ts, 'ts'))
    squeeze.sort(*SQUEEZE_SORT_FIELDS).put(tmp_path, schema=SQUEEZE_SCHEMA)
    job.run()

    driver = cluster.driver
    if driver.exists(path):
        driver.remove(path)
    driver.copy(tmp_path, path)
    driver.remove(tmp_path)

    row_count = driver.client.get(path + '/@row_count')
    return row_count > 5000

def get_chat_action_type(payload_type):
    """Translate a chat payload type code into its squeeze action name.

    Codes 1-4 map to comment, image, file and sticker respectively; any
    other value yields ``"chat other"``.
    """
    known_payloads = (
        (1, "chat comment"),
        (2, "chat image"),
        (3, "chat file"),
        (4, "chat sticker"),
    )
    # Compared with == (not hashed) so any kind of value can be passed in.
    for code, action in known_payloads:
        if payload_type == code:
            return action
    return "chat other"

def get_chat_answers(groups):
    """Reducer: pair an answered message with the message answering it.

    Within one group, the last ``TDepResolutionInfo`` row supplies the chat
    id and timestamp of the answered message, and the last ``TMessageContent``
    row with a truthy ``ChatName`` supplies the author, timestamp and payload
    type of the answer. A row is emitted only when both were found.

    Args:
        groups: iterable of ``(key, recs)`` pairs; the key is not used.

    Yields:
        One ``Record`` per group that contains both kinds of rows.
    """
    for _, recs in groups:
        answered = None   # (chat id, timestamp) of the message answered
        message = None    # (uid, timestamp, payload type) of the answer
        for rec in recs:
            event = rec["event_name"]
            if event == "TDepResolutionInfo":
                answered = (rec["ChatId"], rec["Timestamp"])
            elif event == "TMessageContent" and rec["ChatName"]:
                message = (rec["Uid"], rec["Timestamp"], rec["PayloadType"])

        if answered is None or message is None:
            continue

        answered_chat_id, answered_timestamp = answered
        owner_puid, action_timestamp, payload_type = message
        yield Record(answered_chat_id=answered_chat_id,
                     answered_timestamp=answered_timestamp,
                     action_owner_puid=owner_puid,
                     action_timestamp=action_timestamp,
                     action_payload_type=payload_type)

def map_chat_log(recs):
    """Mapper: convert chat message rows into squeeze rows.

    Every row yields one ``Record`` for the user ``Uid``. For a plain
    message that user is also the action owner. When the row carries a
    truthy ``action_owner_puid`` (rows joined with an answer), the owner,
    the timestamp and the action type are taken from the answering message
    instead.
    """
    for rec in recs:
        fields = {}
        fields['ts'] = str(get_timestamp(rec['Timestamp']))
        fields['servicetype'] = 'ether'
        fields['puid'] = str(rec['Uid'])
        # The uuid is the third '/'-separated component of the chat id.
        fields['uuid'] = rec['ChatId'].split('/')[2]
        fields['action_type'] = get_chat_action_type(rec["PayloadType"])

        if not rec.get("action_owner_puid"):
            fields['action_owner_puid'] = str(rec['Uid'])
        else:
            fields.update({
                'ts': str(get_timestamp(rec['action_timestamp'])),
                'action_type': get_chat_action_type(rec['action_payload_type']),
                'action_owner_puid': str(rec['action_owner_puid']),
            })
        yield Record(**fields)

def process_chat_logs(cluster, date):
    """Build the chat squeeze for ``date``.

    Three jobs run in sequence:

    1. Chat events of the day whose ``(source_uri, frame_id)`` pair occurs in
       at least one row with ``Namespace == 7`` are appended to
       ``CHAT_TOTAL_TABLE``.
    2. ``CHAT_TOTAL_TABLE`` is re-sorted by ``(source_uri, frame_id)``.
    3. Messages of the day, plus messages that were answered during the day,
       are mapped by ``map_chat_log`` and written to a scratch table, which
       then replaces the destination table.

    Args:
        cluster: cluster object providing ``job()`` and ``driver``.
        date: day to process, formatted as ``%Y-%m-%d``.
    """
    day_start = int(time.mktime(dt.strptime(date, "%Y-%m-%d").timetuple()))
    day_end = day_start + 24 * 60 * 60
    events_table = CHAT_EVENTS_LOG_PREFIX + date
    frame_key = ('source_uri', 'frame_id')

    # --- Job 1: append the day's relevant events to the total table ------
    job = cluster.job()
    ether_frames = job.table(events_table).filter(sf.equals('Namespace', 7))
    ether_frames = ether_frames.groupby(*frame_key).aggregate(count=na.count())
    ether_frames = ether_frames.sort(*frame_key)

    day_events = job.table(events_table).join(ether_frames, by=list(frame_key))
    day_events.put(CHAT_TOTAL_TABLE, append=True)

    job.run()

    # --- Job 2: restore the sort order of the total table ----------------
    job = cluster.job()
    job.table(CHAT_TOTAL_TABLE).sort(*frame_key).put(CHAT_TOTAL_TABLE)
    job.run()

    # --- Job 3: build the squeeze ----------------------------------------
    path = SOCIALITY_SQUEEZE_PREFIX + CHAT_SQUEEZE_PREFIX + date
    tmp_path = SOCIALITY_TMP_SQUEEZE_PREFIX + CHAT_SQUEEZE_PREFIX + date
    job = cluster.job()

    # Answers written during the day, keyed by the message they answer.
    answers_to_join = job.table(CHAT_TOTAL_TABLE).groupby(*frame_key).reduce(get_chat_answers)
    answers_to_join = answers_to_join.filter(
        sf.custom(lambda ts: day_start <= get_timestamp(ts) < day_end, 'action_timestamp'))

    named_messages = job.table(CHAT_TOTAL_TABLE).filter(
        sf.equals('event_name', 'TMessageContent'),
        sf.custom(lambda name: name, 'ChatName'))
    answers = named_messages.join(
        answers_to_join,
        by_left=['ChatId', 'Timestamp'],
        by_right=['answered_chat_id', 'answered_timestamp'])

    comments = job.table(CHAT_TOTAL_TABLE).filter(
        sf.equals('event_name', 'TMessageContent'),
        sf.custom(lambda name: name, 'ChatName'),
        sf.custom(lambda ts: ts and day_start <= get_timestamp(ts) < day_end, 'Timestamp'))

    squeeze = job.concat(answers, comments).map(map_chat_log)
    squeeze.sort(*SQUEEZE_SORT_FIELDS).put(tmp_path, schema=SQUEEZE_SCHEMA)
    job.run()

    driver = cluster.driver
    if driver.exists(path):
        driver.remove(path)
    driver.copy(tmp_path, path)
    driver.remove(tmp_path)

def map_emoji_logs(recs):
    """Mapper: convert emoji rows into squeeze rows.

    The user id is the row's ``yandexuid`` prefixed with ``'y'``; it is used
    both as ``puid`` and as ``action_owner_puid``.
    """
    for rec in recs:
        ts = str(rec["timestamp"])
        action_type = 'emoji ' + str(rec["emoji_type"])
        yuid = 'y' + rec['yandexuid']
        yield Record(ts=ts,
                     servicetype='ether',
                     action_type=action_type,
                     puid=yuid,
                     action_owner_puid=yuid,
                     uuid=rec['content_id'])

def process_emoji_logs(cluster, date):
    """Build the emoji squeeze for ``date``.

    Redir-log rows of that date mentioning the sticker-send path are parsed
    with ``qb2``, mapped by ``map_emoji_logs``, sorted and written to a
    scratch table, which then replaces the destination table.

    Args:
        cluster: cluster object providing ``job()`` and ``driver``.
        date: name of the redir-log table to read (appended to the prefix).
    """
    path = SOCIALITY_SQUEEZE_PREFIX + EMOJI_SQUEEZE_PREFIX + date
    tmp_path = SOCIALITY_TMP_SQUEEZE_PREFIX + EMOJI_SQUEEZE_PREFIX + date

    job = cluster.job()

    # Cheap substring pre-filter on the raw line before the rows are parsed.
    raw_rows = job.table(REDIR_LOG_PREFIX + "/" + date)
    raw_rows = raw_rows.filter(sf.contains('value', 'path=player-events.sticker-send'))

    extracted_fields = ['yandexuid', 'timestamp',
                        se.log_field('content_id'),
                        se.log_field('emoji_type')]
    parsed_filters = [sf.defined('path', 'content_id', 'yandexuid', 'timestamp', 'emoji_type'),
                      sf.contains('path', 'player-events.sticker-send')]
    parsed_rows = raw_rows.qb2(log='redir-log',
                               fields=extracted_fields,
                               filters=parsed_filters,
                               mode='yamr_lines', intensity='data')

    squeeze = parsed_rows.map(map_emoji_logs).sort(*SQUEEZE_SORT_FIELDS)
    squeeze.put(tmp_path, schema=SQUEEZE_SCHEMA)
    job.run()

    driver = cluster.driver
    if driver.exists(path):
        driver.remove(path)
    driver.copy(tmp_path, path)
    driver.remove(tmp_path)

def calc_final_table(cluster, date):
    """Concatenate the four per-source squeezes of ``date`` into the final table.

    An already existing final table for that date is removed first. The
    reaction, comment, chat and emoji squeezes are then concatenated, sorted
    and written with the squeeze schema.
    """
    final_path = SOCIALITY_SQUEEZE_PREFIX + date
    if cluster.driver.exists(final_path):
        cluster.driver.remove(final_path)

    job = cluster.job()
    source_prefixes = (REACTION_SQUEEZE_PREFIX,
                       CMNT_SQUEEZE_PREFIX,
                       CHAT_SQUEEZE_PREFIX,
                       EMOJI_SQUEEZE_PREFIX)
    parts = [job.table(SOCIALITY_SQUEEZE_PREFIX + prefix + date)
             for prefix in source_prefixes]
    combined = job.concat(*parts).sort(*SQUEEZE_SORT_FIELDS)
    combined.put(final_path, schema=SQUEEZE_SCHEMA)
    job.run()

def get_date_list(fast_mode, start_date, end_date, cluster):
    """Return the list of dates (or fast-log table names) to process.

    In regular mode the result is every day from ``start_date`` to
    ``end_date`` inclusive, formatted as the first ten characters of the
    timestamp's string form. In fast mode the date arguments are ignored:
    the result is the sorted list of entries under the 5-minute stream
    directory that compare greater than the ``date`` stored in the first row
    of ``LAST_TIME_CMNT_FAST_SQUEEZE_TABLE``.
    """
    if fast_mode:
        last_date = cluster.read(LAST_TIME_CMNT_FAST_SQUEEZE_TABLE)[0]["date"]
        available = cluster.driver.list(CMNT_ACCESS_LOG_PREFIX + "stream/5min")
        return sorted(name for name in available if name > last_date)

    days = []
    for day in pd.date_range(start_date, end_date):
        days.append(str(day)[:10])
    return days

def _parse_bool_flag(value):
    """Interpret a command-line flag value as a boolean.

    ``type=bool`` in argparse calls ``bool()`` on the raw string, so every
    non-empty value - including ``"False"`` and ``"0"`` - becomes True. This
    converter treats the usual negative spellings (and the empty string) as
    False; any other value stays True, as it was before.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


def main():
    """Command-line entry point: build the requested squeeze tables.

    In fast mode only the fast comment squeeze is built from the tables that
    appeared since the last run, and the newest processed table name is then
    stored in ``LAST_TIME_CMNT_FAST_SQUEEZE_TABLE``. Otherwise every date
    between ``--start_date`` and ``--end_date`` is processed with the steps
    enabled by the boolean flags.

    Raises:
        Exception: when ``--server`` is neither ``arnold`` nor ``hahn``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--server', type=str, default='arnold')
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    for flag in ('--process_cmnt_logs', '--process_reaction_logs', '--process_chat_logs',
                 '--process_emoji_logs', '--calc_final_table', '--fast_mode'):
        parser.add_argument(flag, type=_parse_bool_flag, default=False)
    args = parser.parse_args()

    yt_spec_defaults = dict(
        pool_trees=["physical"],
        use_default_tentative_pool_trees=True,
        max_failed_job_count=10
    )
    if args.server == "arnold":
        cluster_factory = clusters.yt.Arnold
        # Only the arnold configuration sets the row weight limit.
        yt_spec_defaults['job_io'] = {"table_writer": {"max_row_weight": 64 * 1024 * 1024}}
    elif args.server == "hahn":
        cluster_factory = clusters.yt.Hahn
    else:
        raise Exception("Unknown cluster")

    cluster = cluster_factory().env(parallel_operations_limit=10,
                                    yt_spec_defaults=yt_spec_defaults,
                                    templates=dict(
                                        tmp_root='//tmp',
                                        title='CalcEtherAndVideoSocialitySqueeze'
                                    ))

    date_list = get_date_list(args.fast_mode, args.start_date, args.end_date, cluster)
    if args.fast_mode:
        # Nothing new since the last run: skip processing entirely instead of
        # indexing into an empty list.
        if date_list:
            process_cmnt_logs_fast(cluster, date_list)
            cluster.driver.write(LAST_TIME_CMNT_FAST_SQUEEZE_TABLE, [Record(date=max(date_list))])
    else:
        for date_str in date_list:
            if args.process_cmnt_logs:
                process_cmnt_logs_daily(cluster, date_str)
            if args.process_reaction_logs:
                # Retry hourly until the reaction squeeze is large enough.
                while not process_reaction_logs(cluster, date_str):
                    print("no reaction logs")
                    time.sleep(3600)
            if args.process_chat_logs:
                process_chat_logs(cluster, date_str)
            if args.process_emoji_logs:
                process_emoji_logs(cluster, date_str)
            if args.calc_final_table:
                calc_final_table(cluster, date_str)


if __name__ == '__main__':
    main()

