#-*- coding: UTF-8 -*-
from common import *

def has_interest_by_search_stat(rec,
                                sport_league_object_id, sport_league_queries,
                                sport_league_teams_object_ids, teams_to_fetch_object_ids,
                                fetch_params):
    """Decide from search statistics whether a record shows interest.

    Both ``rec["web_requests_stats"]`` and ``rec["video_requests_stats"]`` are
    read; each maps an object id to a dict with a ``"requests"`` list of
    UTF-8 encoded byte strings.

    Args:
        rec: record holding the two request-stat mappings described above.
        sport_league_object_id: object id of the league itself; when falsy,
            no league-level requests are counted at all.
        sport_league_queries: lowercase substrings; every request attached to
            any *other* object that contains one of them counts as a league
            request (a request matching several queries counts several times).
        sport_league_teams_object_ids: object ids of all teams of the league.
        teams_to_fetch_object_ids: object ids of the teams being targeted.
        fetch_params: dict providing ``"teams_request_threshold"`` and
            ``"sport_league_request_threshold"``.

    Returns:
        ``(has_interest, source)`` where ``source`` is ``None`` or a short
        description. League interest overrides team interest when both fire.
    """
    from random import random

    stat_tables = (rec["web_requests_stats"], rec["video_requests_stats"])

    reqs_about_league = 0
    if sport_league_object_id:
        for stats in stat_tables:
            # Requests attached directly to the league object.
            reqs_about_league += len(stats.get(sport_league_object_id, {}).get("requests", []))
            if not sport_league_queries:
                continue
            # Requests attached to other objects that mention a league query.
            for object_id in stats:
                if object_id == sport_league_object_id:
                    continue
                # Decode and lowercase each request once, not once per query.
                texts = [elem.decode('utf8').lower() for elem in stats[object_id]["requests"]]
                for query in sport_league_queries:
                    reqs_about_league += sum(query in text for text in texts)

    league_reqs_with_teams = 0
    league_reqs_without_teams = 0
    for stats in stat_tables:
        for object_id in stats:
            if object_id not in sport_league_teams_object_ids:
                continue
            requests_count = len(stats[object_id]["requests"])
            if object_id in teams_to_fetch_object_ids:
                league_reqs_with_teams += requests_count
            else:
                league_reqs_without_teams += requests_count

    has_interest = False
    source = None

    total_team_reqs = league_reqs_with_teams + league_reqs_without_teams
    # total_team_reqs > 0 guards the division: with a threshold <= 0 and no
    # team requests at all the ratio would otherwise be 0 / 0.
    if league_reqs_with_teams >= fetch_params["teams_request_threshold"] and total_team_reqs > 0:
        # Twice the share of targeted teams among all team requests; a share
        # of one half or more therefore always counts as interest.
        probability_of_interest = 2. * float(league_reqs_with_teams) / total_team_reqs
        if random() <= probability_of_interest:
            has_interest = True
            source = "Team interest by requests"

    if reqs_about_league >= fetch_params["sport_league_request_threshold"]:
        has_interest = True
        source = "League interest by requests"

    return has_interest, source

def has_interest_by_tv_online_stats(rec,
                                    sport_league_channel, sport_league_teams, teams_to_fetch,
                                    fetch_params):
    """Decide from online-TV viewing statistics whether a record shows interest.

    Every entry of ``rec["tv_online_stats"]`` is a dict with the byte-string
    fields ``"computed_channel"`` and ``"computed_program"`` and a numeric
    ``"tvt"`` value that is summed into the counters below.

    Args:
        rec: record holding the ``"tv_online_stats"`` mapping.
        sport_league_channel: byte string searched for in ``computed_channel``
            to recognise the league's broadcasts.
        sport_league_teams: names (text) of all teams of the league.
        teams_to_fetch: names of the teams being targeted.
        fetch_params: dict providing ``"tv_online_programs"``,
            ``"tv_online_channels"`` and the four ``tv_online_*_tvt_threshold``
            values.

    Returns:
        ``(has_interest, source)``. When several criteria fire the last one
        checked wins: channels, then programs, then league, then team.
    """
    from random import random

    # Encode every name once instead of once per stats entry.
    league_teams = [(team.encode('utf8'), team in teams_to_fetch) for team in sport_league_teams]
    wanted_programs = [program.encode('utf8') for program in fetch_params["tv_online_programs"]]
    wanted_channels = [channel.encode('utf8') for channel in fetch_params["tv_online_channels"]]

    league_tvt = 0
    programs_tvt = 0
    channels_tvt = 0
    teams_to_fetch_tvt = 0
    other_league_teams_tvt = 0

    for stat in rec["tv_online_stats"].values():
        if sport_league_channel in stat["computed_channel"]:
            league_tvt += stat["tvt"]
            # Team names are only looked for inside league broadcasts.
            for encoded_team, is_target in league_teams:
                if encoded_team in stat["computed_program"]:
                    if is_target:
                        teams_to_fetch_tvt += stat["tvt"]
                    else:
                        other_league_teams_tvt += stat["tvt"]
        for encoded_program in wanted_programs:
            if encoded_program in stat["computed_program"]:
                programs_tvt += stat["tvt"]
        for encoded_channel in wanted_channels:
            if encoded_channel in stat["computed_channel"]:
                channels_tvt += stat["tvt"]

    has_interest = False
    source = None

    all_teams_tvt = teams_to_fetch_tvt + other_league_teams_tvt
    # all_teams_tvt > 0 guards the division: with a threshold <= 0 and no team
    # viewing time at all the ratio would otherwise be 0 / 0.
    if teams_to_fetch_tvt >= fetch_params["tv_online_team_tvt_threshold"] and all_teams_tvt > 0:
        # Twice the share of targeted teams in all team viewing time.
        probability_of_interest = 2. * float(teams_to_fetch_tvt) / all_teams_tvt
        if random() <= probability_of_interest:
            source = "Team interest"
            has_interest = True
    if league_tvt >= fetch_params["tv_online_league_tvt_threshold"]:
        source = "League interest"
        has_interest = True
    if programs_tvt >= fetch_params["tv_online_programs_tvt_threshold"]:
        source = "Programs interest"
        has_interest = True
    if channels_tvt >= fetch_params["tv_online_channels_tvt_threshold"]:
        source = "Channels interest"
        has_interest = True

    return has_interest, source

def has_interest_by_browser_stats(rec, league_website, sport_leagues_teams_websites, teams_to_fetch_websites, fetch_params):
    """Decide from visited URLs whether a record shows interest.

    ``rec["urls"]`` maps a visited URL (UTF-8 byte string) to a visit count.
    A URL belongs to a website when the website string is a substring of the
    decoded URL.

    Args:
        rec: record holding the ``"urls"`` mapping.
        league_website: website of the league, or a falsy value to skip it.
        sport_leagues_teams_websites: websites of all teams of the league.
        teams_to_fetch_websites: websites of the teams being targeted.
        fetch_params: dict providing the ``"team_website_visits"`` and
            ``"league_website_visits"`` thresholds.

    Returns:
        ``(has_interest, source)``. League interest overrides team interest
        when both fire.
    """
    from random import random

    league_website_visits = 0
    teams_website_visits = 0
    other_teams_website_visits = 0

    for raw_url, visits in rec["urls"].items():
        if isinstance(raw_url, bytes):
            try:
                visited_url = raw_url.decode('utf8')
            except UnicodeDecodeError:
                # Best effort: a URL that is not valid UTF-8 is ignored.
                continue
        else:
            visited_url = raw_url

        for website in sport_leagues_teams_websites:
            if website in visited_url:
                if website in teams_to_fetch_websites:
                    teams_website_visits += visits
                else:
                    other_teams_website_visits += visits

        # Counted once per visited URL, independently of the team websites:
        # doing this inside the loop above would multiply league visits by the
        # number of team websites and drop them when there are none.
        if league_website and league_website in visited_url:
            league_website_visits += visits

    has_interest = False
    source = None

    all_teams_visits = teams_website_visits + other_teams_website_visits
    # all_teams_visits > 0 guards the division: with a threshold <= 0 and no
    # team visits at all the ratio would otherwise be 0 / 0.
    if teams_website_visits >= fetch_params["team_website_visits"] and all_teams_visits > 0:
        # Twice the share of targeted teams among all team-website visits.
        probability_of_interest = 2. * float(teams_website_visits) / all_teams_visits
        if random() <= probability_of_interest:
            source = "Teams website visits"
            has_interest = True
    if league_website_visits + all_teams_visits >= fetch_params["league_website_visits"]:
        source = "League website visits"
        has_interest = True

    return has_interest, source

class get_uids_for_push(object):
    """Callable that yields the records showing interest in a league or its teams.

    The league named by ``fetch_params["sport_league"]`` is looked up in
    ``fetch_params["config"]["sport_leagues"]``; the teams listed in
    ``fetch_params["teams"]`` are the targeted ones.
    """

    def __init__(self, fetch_params):
        self.fetch_params = fetch_params

    def __call__(self, recs):
        """Yield ``Record(rec, source=...)`` for every record with detected interest.

        Search statistics take precedence over online-TV statistics, which
        take precedence over browser statistics, when choosing ``source``.
        """
        params = self.fetch_params
        league_name = params["sport_league"]

        # League-level settings; the channel falls back to the league name.
        league_channel = league_name.encode('utf8')
        league_object_id = None
        league_queries = []
        league_site = None

        # "all_*" cover every team of the league, "wanted_*" only targeted ones.
        all_team_object_ids = set()
        wanted_team_object_ids = set()
        all_team_names = []
        wanted_team_names = []
        all_team_sites = set()
        wanted_team_sites = set()

        for league in params["config"]["sport_leagues"]:
            if league["name"] != league_name:
                continue

            league_site = league.get("website", None)
            league_object_id = league.get("object_id", None)
            league_queries = league.get("query_words", [])
            channel_prefix = league.get("channel_prefix")
            if channel_prefix:
                league_channel = channel_prefix.encode('utf8')

            for team in league["teams"]:
                team_name = team["name"]
                is_wanted = team_name in params["teams"]

                # Object ids are collected only for teams with a primary id.
                primary_id = team.get("object_id")
                if primary_id:
                    team_ids = [primary_id]
                    team_ids.extend(team.get("additional_object_ids") or [])
                    all_team_object_ids.update(team_ids)
                    if is_wanted:
                        wanted_team_object_ids.update(team_ids)

                all_team_names.append(team_name)
                if is_wanted:
                    wanted_team_names.append(team_name)

                # Related sites are collected only for teams with a main website.
                primary_site = team.get("website")
                if primary_site:
                    team_sites = list(team.get("related_sites", []))
                    team_sites.append(primary_site)
                    all_team_sites.update(team_sites)
                    if is_wanted:
                        wanted_team_sites.update(team_sites)

        for rec in recs:
            # All three detectors run for every record, whatever the outcome.
            search_hit, search_source = has_interest_by_search_stat(
                rec, league_object_id, league_queries,
                all_team_object_ids, wanted_team_object_ids, params)
            tv_hit, tv_source = has_interest_by_tv_online_stats(
                rec, league_channel, all_team_names, wanted_team_names, params)
            browser_hit, browser_source = has_interest_by_browser_stats(
                rec, league_site, all_team_sites, wanted_team_sites, params)

            if search_hit:
                source = search_source
            elif tv_hit:
                source = tv_source
            elif browser_hit:
                source = browser_source
            else:
                continue

            yield Record(rec, source=source)

def main():
    """Parse the command line, load the config and run the selection job.

    Two tables are written: ``<output_table>_with_info`` with the selected
    records sorted by ``uid``, and ``<output_table>`` with the distinct
    ``install_id`` values of those records.
    """
    # Default for every threshold option; note it is a float, since argparse
    # applies ``type`` only to values given as strings.
    default_threshold = 1e10

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True)
    parser.add_argument('--sport_league', type=str, required=True)
    parser.add_argument('--teams', nargs='+', default=[])
    parser.add_argument('--teams_request_threshold', type=int, default=default_threshold)
    parser.add_argument('--sport_league_request_threshold', type=int, default=default_threshold)
    parser.add_argument('--tv_online_team_tvt_threshold', type=int, default=default_threshold)
    parser.add_argument('--tv_online_league_tvt_threshold', type=int, default=default_threshold)
    parser.add_argument('--tv_online_programs', nargs='+', default=[])
    parser.add_argument('--tv_online_programs_tvt_threshold', type=int, default=default_threshold)
    parser.add_argument('--tv_online_channels', nargs='+', default=[])
    parser.add_argument('--tv_online_channels_tvt_threshold', type=int, default=default_threshold)
    parser.add_argument('--output_table', type=str, required=True)
    parser.add_argument('--team_website_visits', type=int, default=default_threshold)
    parser.add_argument('--league_website_visits', type=int, default=default_threshold)
    args = parser.parse_args()

    with codecs.open(args.config, 'r', 'utf8') as config_file:
        config = json.load(config_file)

    def decode_all(values):
        # Command-line values are decoded from UTF-8 before use.
        return [value.decode('utf8') for value in values]

    fetch_params = {
        "config": config,
        "sport_league": args.sport_league.decode('utf8'),
        "teams": decode_all(args.teams),
        "tv_online_programs": decode_all(args.tv_online_programs),
        "tv_online_channels": decode_all(args.tv_online_channels),
    }
    # Numeric options are passed through under their command-line names.
    for option in ("teams_request_threshold",
                   "sport_league_request_threshold",
                   "tv_online_team_tvt_threshold",
                   "tv_online_league_tvt_threshold",
                   "tv_online_programs_tvt_threshold",
                   "tv_online_channels_tvt_threshold",
                   "team_website_visits",
                   "league_website_visits"):
        fetch_params[option] = getattr(args, option)

    spec_defaults = {"pool_trees": ["physical"], "tentative_pool_trees": ["cloud"]}
    templates = {"tmp_root": '//tmp', "title": 'GetUidsForPushes'}
    cluster = clusters.yt.Hahn().env(parallel_operations_limit=10,
                                     yt_spec_defaults=spec_defaults,
                                     templates=templates)

    job = cluster.job()

    # Input rows whose 'in_sup_base' field is defined and equal to True.
    source_rows = job.table(STATS_PREFIX + AGGREGATED_STATS_SUFFIX)
    source_rows = source_rows.filter(sf.defined('in_sup_base'),
                                     sf.equals('in_sup_base', True))

    # Keep mapper output where both 'need_push' and 'install_id' are truthy.
    selected = source_rows.map(get_uids_for_push(fetch_params))
    selected = selected.filter(sf.custom(lambda x, y : x and y, 'need_push', 'install_id'))

    uids_with_info = selected.sort('uid').put(args.output_table + "_with_info")

    # Collapse to one row per install_id.
    per_install = uids_with_info.groupby('install_id').aggregate(count=na.count())
    per_install.project('install_id').sort('install_id').put(args.output_table)

    job.run()

# Run the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
