#-*- coding: UTF-8 -*-
from common import *

# Upper bound on the duration of a single video, in seconds (5 minutes);
# longer videos are skipped by add_rubric_videos.
MAX_VIDEO_DURATION = 5 * 60
# Passed to get_old_schedule_part together with the current timestamp.
# NOTE(review): presumably the number of seconds of the already published
# schedule that must be kept untouched -- confirm against the helper in common.
FUTURE_GAP = 7200

# News API endpoints: the list of rubrics, the stories of one rubric
# (formatted with the rubric alias) and the data of one story (formatted
# with the story url).
RUBRIC_LIST_REQUEST = "https://news.yandex.ru/api/v2/rubric_list"
RUBRIC_STORIES_TEMPLATE = "https://news.yandex.ru/api/v2/rubric?rubric={}&count=100"
STORY_DATA_TEMPLATE = "https://news.yandex.ru/api/v2/story?url={}&video_sizes=100.100.10.orig&video_hosted=1"

def get_news_by_rubric():
    """Fetch the stories that carry hosted videos for every news rubric.

    The rubric list is requested first; then, for each rubric alias, its
    stories are requested, and for each story the story data is requested to
    extract its video items.

    Returns:
        dict mapping a rubric alias to a list of stories. Each story is a
        dict ``{"id": <story id>, "videos": [...]}`` and each video is a dict
        with the keys ``player_url``, ``duration`` and ``source_id`` (the
        latter converted to ``str``). Stories without videos are omitted;
        a rubric without any such story maps to an empty list.

    Raises:
        Any error raised while requesting or decoding the rubric list or a
        rubric's story list propagates to the caller. Only failures of the
        per-story request are tolerated (that story is skipped).
    """
    rubrics_info = requests.get(RUBRIC_LIST_REQUEST).json()
    rubrics = [rubric_info["alias"] for rubric_info in rubrics_info['rubrics']]

    news_by_rubric = {}
    for rubric in rubrics:
        rubric_stories = []
        news_by_rubric[rubric] = rubric_stories
        rubric_news = requests.get(RUBRIC_STORIES_TEMPLATE.format(rubric)).json()
        for story in rubric_news['items']:
            story_id = story['obj']['id']
            story_url = story['obj']['url']
            try:
                story_data = requests.get(STORY_DATA_TEMPLATE.format(story_url)).json()
            except Exception:
                # Best effort: a story whose data cannot be fetched or decoded
                # is skipped. ``Exception`` (rather than a bare ``except``)
                # keeps KeyboardInterrupt and SystemExit working.
                continue
            video_items = story_data.get('obj', {}).get('video', {}).get('items', [])
            story_videos = [
                {'player_url': video_item['hosted_video']['player_url'],
                 'duration': video_item['hosted_video']['duration'],
                 'source_id': str(video_item['doc']['source_id'])}
                for video_item in video_items
            ]
            if story_videos:
                rubric_stories.append({"id": story_id, "videos": story_videos})
    return news_by_rubric

def add_rubric_videos(rubric_news, current_ts, end_ts, current_content_ids, schedule, source_ids_white_list, possible_news_content_ids, rubric_time_limit, rubric, current_duration, ads_period):
    """Schedule videos of one rubric back to back, starting at ``current_ts``.

    At most one video is taken from each story: the first one that passes
    every filter. Each accepted video is appended to ``schedule`` as a
    ``Record`` and its content id is added to ``current_content_ids``; both
    containers are mutated in place.

    Args:
        rubric_news: list of stories, each a dict with a "videos" list whose
            items have the keys "player_url", "duration" and "source_id".
        current_ts: timestamp at which the next video starts.
        end_ts: timestamp at or after which nothing more is scheduled.
        current_content_ids: set of content ids already used; updated.
        schedule: list that receives the new records; updated.
        source_ids_white_list: collection of allowed video source ids.
        possible_news_content_ids: collection of content ids that may be
            scheduled.
        rubric_time_limit: the summed duration of the videos added by this
            call always stays strictly below this value.
        rubric: rubric name stored in every created record.
        current_duration: duration scheduled before this call; used only to
            decide where ads go.
        ads_period: a record gets ``with_ads=True`` when adding it makes the
            accumulated duration cross a multiple of this value.

    Returns:
        The timestamp right after the last scheduled video, or the
        unchanged ``current_ts`` when nothing was added.
    """
    rubric_duration = 0
    for news_info in rubric_news:
        for video_info in news_info["videos"]:
            # The content id is the last path component of the player url.
            content_id = video_info["player_url"].split('/')[-1]
            # Skip videos that were already added.
            if content_id in current_content_ids:
                continue
            duration = video_info["duration"]
            # Skip videos that are too short or too long.
            if duration < MIN_CONTENT_DURATION or duration > MAX_VIDEO_DURATION:
                continue
            # Skip videos that would reach or exceed the rubric time limit.
            if duration + rubric_duration >= rubric_time_limit:
                continue
            # Skip videos whose source is not in the white list.
            if video_info['source_id'] not in source_ids_white_list:
                continue
            # Skip videos that are not among the possible news content ids.
            if content_id not in possible_news_content_ids:
                continue
            # Stop as soon as the schedule is complete.
            if current_ts >= end_ts:
                return current_ts
            current_content_ids.add(content_id)
            video_start_ts = current_ts
            video_end_ts = current_ts + duration
            video_start_time_str = datetime.fromtimestamp(video_start_ts).strftime('%Y-%m-%d %H:%M:%S')
            # Ads are attached to the video during which the accumulated
            # duration passes the next multiple of ads_period.
            with_ads = (current_duration + duration) // ads_period > current_duration // ads_period
            schedule.append(Record(rubric=rubric, content_id=content_id,
                                   begin=video_start_ts, end=video_end_ts,
                                   dt=video_start_time_str, with_ads=with_ads))
            current_duration += duration
            current_ts = video_end_ts
            rubric_duration += duration
            # Only one video per story.
            break
    return current_ts

def make_news_schedule(news_by_rubric, start_ts, end_ts, source_ids_white_list, possible_news_content_ids, rubric_time_limit, ads_period):
    """Build schedule records that fill the interval from start_ts to end_ts.

    The rubrics of ``rubric_time_limit`` are visited in repeated passes. In
    each pass every rubric contributes videos via ``add_rubric_videos``, and
    a video is used at most once per pass (it may reappear in a later pass).

    Args:
        news_by_rubric: dict mapping a rubric to its list of stories.
        start_ts: timestamp at which the first video starts.
        end_ts: timestamp at or after which no further video is started.
        source_ids_white_list: collection of allowed video source ids.
        possible_news_content_ids: collection of content ids that may be
            scheduled.
        rubric_time_limit: dict mapping a rubric to its per-pass time limit;
            its keys define which rubrics are scheduled.
        ads_period: period used to decide which records get ads.

    Returns:
        Tuple ``(schedule, current_ts)``: the list of created records and the
        timestamp right after the last one (``start_ts`` if none was added).
    """
    schedule = []
    current_ts = start_ts
    while current_ts < end_ts:
        pass_start_ts = current_ts
        # A fresh set per pass: duplicates are only forbidden within a pass.
        current_content_ids = set()
        for rubric in rubric_time_limit:
            current_ts = add_rubric_videos(news_by_rubric.get(rubric, []), current_ts, end_ts, current_content_ids, schedule, source_ids_white_list, possible_news_content_ids, rubric_time_limit[rubric], rubric, current_ts - start_ts, ads_period)
        if current_ts == pass_start_ts:
            # The pass made no progress. Every following pass would start
            # from the same state and do the same, so stop instead of
            # looping forever (e.g. no rubrics or no eligible videos).
            break
    return schedule, current_ts

def main():
    """Rebuild the news schedule table given by ``--table``.

    Steps:
      1. Parse the command line; ``--rubrics`` and ``--time_limits`` are
         parallel lists and must have the same length.
      2. Fetch the stories with videos for every rubric from the news API.
      3. Collect the content ids that may be scheduled from ACTUAL_URLS.
      4. Extract the part of the existing schedule to keep and build new
         records from where it ends until ``now + --schedule_gap``.
      5. Join the new records with the URL data, merge them with the kept
         part and replace ``--table`` with the result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--table', type=str, required=True)
    parser.add_argument('--source_ids_white_list', nargs='+', required=True)
    parser.add_argument('--rubrics', nargs='+', required=True)
    parser.add_argument('--time_limits', nargs='+', type=int, required=True)
    parser.add_argument('--schedule_gap', type=int, required=True)
    # NOTE(review): --with_ads is required but never read in this function.
    parser.add_argument('--with_ads', type=int, required=True)
    parser.add_argument('--channel_id', type=str, required=True)
    parser.add_argument('--ads_period', type=int, required=True)
    args = parser.parse_args()

    rubrics = args.rubrics
    # Fail with a usage message instead of an IndexError (too few limits) or
    # silently ignored values (too many limits).
    if len(rubrics) != len(args.time_limits):
        parser.error("--rubrics and --time_limits must have the same number of values "
                     "(got {} and {})".format(len(rubrics), len(args.time_limits)))
    rubric_time_limit = dict(zip(rubrics, args.time_limits))

    cluster = clusters.yt.Arnold().env(parallel_operations_limit=10,
                                     yt_spec_defaults=dict(
                                         pool_trees=["physical"],
                                         tentative_pool_trees=["cloud"]
                                     ),
                                     templates=dict(
                                         tmp_root='//tmp',
                                         title='GetNewsSchedule'
                                     ))

    news_by_rubric = get_news_by_rubric()

    # Materialize the ContentGroupID values of the rows of ACTUAL_URLS that
    # pass filter_bitrate and have ContentTypeID 48.
    # NOTE(review): 48 is presumably the news content type -- confirm.
    possible_news_content_ids_table = "//tmp/msvvitaly/possible_news_content_ids_" + str(time())
    job = cluster.job()
    job.table(ACTUAL_URLS) \
       .map(filter_bitrate, files=nfi_common) \
       .filter(sf.equals('ContentTypeID', 48)) \
       .project('ContentGroupID').put(possible_news_content_ids_table)
    job.run()

    # Ids are compared as strings with the ids parsed from player urls.
    possible_news_content_ids = set()
    for rec in cluster.driver.read(possible_news_content_ids_table):
        possible_news_content_ids.add(str(rec["ContentGroupID"]))
    print("Possible news videos count {}.".format(len(possible_news_content_ids)))

    current_ts = int(time())
    old_schedule_part = "//tmp/msvvitaly/news_schedule_old_part_" + str(current_ts)
    # The returned timestamp is where the new part of the schedule starts.
    last_schedule_ts = get_old_schedule_part(cluster, args.table, old_schedule_part, FUTURE_GAP, current_ts)

    new_schedule_end_ts = current_ts + args.schedule_gap
    new_schedule_part, new_schedule_end_ts = make_news_schedule(news_by_rubric, last_schedule_ts,
                                                           new_schedule_end_ts,
                                                           args.source_ids_white_list,
                                                           possible_news_content_ids,
                                                           rubric_time_limit,
                                                           args.ads_period)
    new_schedule_part_table = "//tmp/msvvitaly/news_schedule_new_part_" + str(current_ts)
    cluster.driver.write(new_schedule_part_table, new_schedule_part)

    job = cluster.job()

    # Attach URL data to the new records; the page ids are blanked for
    # records that are scheduled without ads.
    new_schedule_part_joined = job.table(ACTUAL_URLS).map(map_au, files=nfi_common) \
       .project(ne.all(), content_id=ne.custom(lambda x : str(x), 'ContentGroupID')) \
       .join(job.table(new_schedule_part_table), by='content_id') \
       .project(ne.all(exclude=('pageId', 'yatvChannelPageId')),
                pageId=ne.custom(lambda x, y : x if y else '', 'pageId', 'with_ads'),
                yatvChannelPageId=ne.custom(lambda x, y : x if y else '', 'yatvChannelPageId', 'with_ads')) \
       .map(make_event_id(args.channel_id), files=nfi_common)

    job.concat(job.table(old_schedule_part), new_schedule_part_joined) \
       .sort('begin') \
       .put(new_schedule_part_table + "_merged")
    job.run()

    # Replace the target table with the merged schedule.
    if cluster.driver.exists(args.table):
        cluster.driver.remove(args.table)
    cluster.driver.copy(new_schedule_part_table + "_merged", args.table)

# Run the schedule update only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
