#-*- coding: UTF-8 -*-
from common import *

def aggregate_search_stats(groups):
    """Reducer: collapse all search-stat records of one uid into one record.

    groups -- iterable of (key, recs) pairs; key["uid"] is the grouping
        key and recs iterates over the records of that group.

    Yields one Record per group carrying the uid (with a leading 'y' or
    'uu/' prefix removed), the browser / os_family of the last record
    seen ("unknown" when the group has no records) and four stats dicts
    accumulated through update_event_stats (a helper that is not defined
    in this file).
    """
    # (record field, sub-keys handed to update_event_stats), in merge order.
    merged_fields = (
        ("special_events_stats", ("requests",)),
        ("sport_leagues_stats", ("requests", "shown_websites")),
        ("video_requests_stats", ("timestamps", "requests", "watches", "series")),
        ("web_requests_stats", ("timestamps", "requests", "series")),
    )

    for key, recs in groups:
        uid = key["uid"]
        # Only the first matching prefix is stripped; 'y' is tried first.
        for prefix in ('y', 'uu/'):
            if uid.startswith(prefix):
                uid = uid[len(prefix):]
                break

        browser = os_family = "unknown"
        merged = dict((field, {}) for field, _ in merged_fields)
        for rec in recs:
            # Later records overwrite earlier ones: the last record wins.
            browser = rec["browser"]
            os_family = rec["os_family"]
            for field, subkeys in merged_fields:
                # A fresh list per call, exactly as a list literal would give.
                update_event_stats(merged[field], rec[field], list(subkeys))

        yield Record(uid=uid, os_family=os_family, browser=browser, **merged)

# Size limit for each aggregated per-uid YouTube stats dict (titles and
# website URLs): aggregate_browser_stats emits nothing for a uid whose
# aggregated dict reaches this many distinct entries.
MAX_BROWSER_YOUTUBE_STATS = 100000

def aggregate_browser_stats(groups):
    """Reducer: collapse all browser-stat records of one uid into one record.

    groups -- iterable of (key, recs) pairs; key["uid"] is the grouping
        key and recs iterates over the records of that group.

    Yields at most one Record per group with the uid (a leading 'y' or
    'uu/' prefix removed), two stats dicts accumulated through
    update_event_stats (a helper that is not defined in this file) and
    "youtube_stats", which holds per-title and per-URL counters summed
    over the records.  A group is dropped entirely when either counter
    dict has MAX_BROWSER_YOUTUBE_STATS or more distinct keys.
    """
    for key, recs in groups:
        uid = key["uid"]
        # Only the first matching prefix is stripped; 'y' is tried first.
        for prefix in ('y', 'uu/'):
            if uid.startswith(prefix):
                uid = uid[len(prefix):]
                break

        special_events = {}
        sport_leagues = {}
        title_counts = {}
        website_counts = {}
        for rec in recs:
            update_event_stats(special_events, rec["special_events_stats"],
                               ["matched_titles", "website_visits"])
            update_event_stats(sport_leagues, rec["sport_leagues_stats"],
                               ["matched_titles", "website_visits", "teams_stats"])

            # "youtube_stats" and both of its sub-dicts are optional.
            rec_youtube = rec.get("youtube_stats", {})
            for title, count in rec_youtube.get("youtube_titles_stats", {}).items():
                title_counts[title] = title_counts.get(title, 0) + count
            for url, count in rec_youtube.get("youtube_websites_stats", {}).items():
                website_counts[url] = website_counts.get(url, 0) + count

        # Skip uids whose YouTube stats reached the size limit.
        if (len(title_counts) >= MAX_BROWSER_YOUTUBE_STATS
                or len(website_counts) >= MAX_BROWSER_YOUTUBE_STATS):
            continue

        yield Record(uid=uid,
                     special_events_stats=special_events,
                     sport_leagues_stats=sport_leagues,
                     youtube_stats={"youtube_titles_stats": title_counts,
                                    "youtube_websites_stats": website_counts})

def aggregate_tv_online_stats(groups):
    """Reducer: merge the per-object TV-online stats of each uid.

    groups -- iterable of (key, recs) pairs; key["uid"] is the grouping
        key and every rec carries a "tv_online_stats" dict keyed by
        object id.

    Yields one Record(uid, tv_online_stats) per group.  For every object
    id "tvt" is summed over the records, "timestamp" is the maximum seen,
    and "computed_channel" / "computed_program" are taken from the first
    record that mentions the object.
    """
    for key, recs in groups:
        uid = key["uid"]
        merged = {}
        for rec in recs:
            for object_id, stats in rec["tv_online_stats"].items():
                known = merged.get(object_id)
                if known is None:
                    # First sighting: copy only the fields that are tracked.
                    merged[object_id] = {
                        "computed_channel": stats["computed_channel"],
                        "computed_program": stats["computed_program"],
                        "tvt": stats["tvt"],
                        "timestamp": stats["timestamp"],
                    }
                    continue
                known["tvt"] += stats["tvt"]
                known["timestamp"] = max(known["timestamp"], stats["timestamp"])
        yield Record(uid=uid, tv_online_stats=merged)

class get_uids_for_push_by_search_stat(object):
    """Mapper keeping records with enough search requests about target objects.

    fetch_params -- dict; this mapper reads
        "object_ids"                 iterable of object ids to look for,
        "objects_request_threshold"  minimal total number of requests.
    """
    def __init__(self, fetch_params):
        self.fetch_params = fetch_params

    def __call__(self, recs):
        """Yield Record(rec, source="search") for every qualifying record.

        For each record the lengths of the "requests" lists stored under
        every target object id in "web_requests_stats" and
        "video_requests_stats" are added up; the record is emitted when
        that total reaches the configured threshold.
        """
        for rec in recs:
            reqs_about_objects = 0
            for object_id in self.fetch_params["object_ids"]:
                # A missing object id or "requests" list counts as zero.
                reqs_about_objects += len(rec["web_requests_stats"].get(object_id, {}).get("requests", []))
                reqs_about_objects += len(rec["video_requests_stats"].get(object_id, {}).get("requests", []))

            if reqs_about_objects >= self.fetch_params["objects_request_threshold"]:
                yield Record(rec, source="search")

class get_uids_for_push_by_tv_online(object):
    """Mapper keeping records with enough TV-online watch time on target programs.

    fetch_params -- dict; this mapper reads
        "tv_online_programs"                program-name substrings; each is
                                            UTF-8 encoded before matching,
        "tv_online_programs_tvt_threshold"  minimal summed "tvt" value.
    """
    def __init__(self, fetch_params):
        self.fetch_params = fetch_params

    def __call__(self, recs):
        """Yield Record(rec, source="tv_online") for every qualifying record.

        The "tvt" of every object whose "computed_program" contains at
        least one of the target program names is summed; the record is
        emitted when the sum reaches the configured threshold.
        """
        # The encoded patterns do not depend on the record, so build them once.
        encoded_programs = [program.encode('utf8')
                            for program in self.fetch_params["tv_online_programs"]]
        threshold = self.fetch_params["tv_online_programs_tvt_threshold"]

        for rec in recs:
            programs_tvt = 0
            for stats in rec["tv_online_stats"].values():
                # Count each object at most once, even when several patterns
                # match it; otherwise overlapping patterns inflate the total.
                if any(program in stats["computed_program"] for program in encoded_programs):
                    programs_tvt += stats["tvt"]
            if programs_tvt >= threshold:
                yield Record(rec, source="tv_online")

class get_uids_for_push_by_browser(object):
    """Mapper keeping records with enough matching YouTube titles or URLs.

    fetch_params -- dict; this mapper reads
        "youtube_browser_titles" / "youtube_browser_titles_threshold"
        "youtube_browser_urls"   / "youtube_browser_urls_threshold"
    where the first item of each pair is a list of substrings and the
    second the minimal summed counter value.
    """
    def __init__(self, fetch_params):
        self.fetch_params = fetch_params

    def __call__(self, recs):
        """Yield Record(rec, source="browser", reason=...) for matching records.

        The counters of all titles (resp. URLs) containing at least one
        configured substring are summed; each title / URL contributes once.
        reason is "Urls matched" whenever the URL threshold is reached,
        otherwise "Title matched".
        """
        for rec in recs:
            youtube_stats = rec["youtube_stats"]

            title_stats = youtube_stats.get("youtube_titles_stats", {})
            title_count = sum(
                hits for title, hits in title_stats.items()
                if any(pattern in title
                       for pattern in self.fetch_params["youtube_browser_titles"]))

            website_stats = youtube_stats.get("youtube_websites_stats", {})
            url_count = sum(
                hits for url, hits in website_stats.items()
                if any(pattern in url
                       for pattern in self.fetch_params["youtube_browser_urls"]))

            titles_hit = title_count >= self.fetch_params["youtube_browser_titles_threshold"]
            urls_hit = url_count >= self.fetch_params["youtube_browser_urls_threshold"]

            # The URL reason takes precedence when both thresholds are met.
            if urls_hit:
                reason = "Urls matched"
            elif titles_hit:
                reason = "Title matched"
            else:
                continue
            yield Record(rec, source="browser", reason=reason)

def main():
    """Parse the command line, then build and run the uid-selection job.

    For each of the --days_count days ending at --date the daily search,
    TV-online, desktop-browser and mobile-browser stats tables are
    addressed.  Every source with at least one selection pattern given on
    the command line is concatenated over the days, grouped by uid,
    reduced with its aggregate_* function and filtered with its
    get_uids_for_push_by_* mapper; the selected records of all sources
    are concatenated and written to --output_table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--days_count', type=int, required=True)
    # Every selection criterion is a list of patterns plus a threshold; the
    # default threshold is large enough that an unset criterion never fires.
    for patterns_flag, threshold_flag in (
            ('--object_ids', '--objects_request_threshold'),
            ('--tv_online_programs', '--tv_online_programs_tvt_threshold'),
            ('--youtube_browser_titles', '--youtube_browser_titles_threshold'),
            ('--youtube_browser_urls', '--youtube_browser_urls_threshold')):
        parser.add_argument(patterns_flag, nargs='+', default=[])
        parser.add_argument(threshold_flag, type=int, default=1e10)
    parser.add_argument('--output_table', type=str, required=True)
    args = parser.parse_args()

    fetch_params = dict(
        object_ids=args.object_ids,
        objects_request_threshold=args.objects_request_threshold,
        # Program names are decoded here; the TV-online mapper encodes them
        # back to UTF-8 before matching.
        tv_online_programs=[program.decode('utf8') for program in args.tv_online_programs],
        tv_online_programs_tvt_threshold=args.tv_online_programs_tvt_threshold,
        youtube_browser_titles=args.youtube_browser_titles,
        youtube_browser_titles_threshold=args.youtube_browser_titles_threshold,
        youtube_browser_urls=args.youtube_browser_urls,
        youtube_browser_urls_threshold=args.youtube_browser_urls_threshold,
    )

    end_date = dt.strptime(args.date, "%Y-%m-%d")
    # Date suffixes of the daily tables, newest first.
    day_suffixes = [dt.strftime(end_date - timedelta(days_back), "%Y-%m-%d")
                    for days_back in range(args.days_count)]

    yt_spec_defaults = dict(pool_trees=["physical"],
                            tentative_pool_trees=["cloud"])
    templates = dict(tmp_root='//tmp',
                     title='GetUidsForPushes')
    cluster = clusters.yt.Hahn().env(parallel_operations_limit=10,
                                     yt_spec_defaults=yt_spec_defaults,
                                     templates=templates)

    job = cluster.job()

    def daily_tables(prefix):
        # One input table per day for the given path prefix.
        return [job.table(prefix + suffix) for suffix in day_suffixes]

    def select_uids(tables, reducer, mapper_class):
        # Concatenate the days, aggregate per uid, keep qualifying records.
        return job.concat(*tables) \
                  .groupby('uid') \
                  .reduce(reducer, files=nfi_common) \
                  .map(mapper_class(fetch_params))

    # All four table lists are created up front, whether or not they are used.
    search_tables = daily_tables(SEARCH_STATS_PREFIX)
    tv_online_tables = daily_tables(TV_ONLINE_STATS_PREFIX)
    desktop_browser_tables = daily_tables(DESKTOP_BROWSER_STATS_PREFIX)
    mobile_browser_tables = daily_tables(MOBILE_BROWSER_STATS_PREFIX)

    selected = []

    if fetch_params["object_ids"]:
        selected.append(select_uids(search_tables,
                                    aggregate_search_stats,
                                    get_uids_for_push_by_search_stat))

    if fetch_params["tv_online_programs"]:
        selected.append(select_uids(tv_online_tables,
                                    aggregate_tv_online_stats,
                                    get_uids_for_push_by_tv_online))

    if fetch_params["youtube_browser_titles"] or fetch_params["youtube_browser_urls"]:
        desktop_browser = select_uids(desktop_browser_tables,
                                      aggregate_browser_stats,
                                      get_uids_for_push_by_browser)
        mobile_browser = select_uids(mobile_browser_tables,
                                     aggregate_browser_stats,
                                     get_uids_for_push_by_browser)
        selected.extend([desktop_browser, mobile_browser])

    # NOTE(review): when no criterion is given, `selected` is empty and
    # job.concat() is called without streams -- confirm how the job API
    # behaves in that case.
    job.concat(*selected) \
       .put(args.output_table)

    job.run()

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
