#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import codecs
import argparse
import json
import time
from collections import defaultdict
import datetime
import requests
import re


def retry_request(request_type, args=None, kwargs=None, max_retries=10, delay=60):
    """Issue an HTTP request via ``requests``, retrying on failure.

    An attempt counts as failed when ``requests`` raises a
    ``RequestException`` or when the response status code is >= 300.

    Args:
        request_type: Name of the ``requests`` module-level function to call
            (for example ``"get"`` or ``"post"``).
        args: Positional arguments forwarded to that function.
        kwargs: Keyword arguments forwarded to that function.
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The first response with a status code below 300. If every attempt
        failed, the last response that was received, or ``None`` when no
        response was received at all.

    Raises:
        AttributeError: If ``requests`` has no attribute ``request_type``.
    """
    args = args or []
    kwargs = kwargs or {}
    # Resolve once: a wrong method name is a programming error, not something
    # that retrying can fix.
    method = getattr(requests, request_type)

    req = None
    for attempt in range(max_retries):
        try:
            req = method(*args, **kwargs)
        except requests.RequestException:
            # Transport-level failure; fall through to the back-off below.
            pass
        else:
            if req.status_code < 300:
                return req
        # Every failed attempt (exception *or* bad status) consumes one retry
        # and is followed by a pause, so a persistently failing endpoint can
        # neither loop forever nor be hammered without delay.
        if attempt < max_retries - 1:
            time.sleep(delay)
    return req


def get_channels():
    """Fetch the channel listing from the ``channels`` HTTP endpoint.

    Returns:
        The value stored under the ``"set"`` key of the JSON response, or an
        empty dict when that key is absent or its value is falsy.
    """
    endpoint = "https://frontend.vh.yandex.ru/channels?geo_id=213&locale=ru&content_type_name=channel"
    response = retry_request("get", kwargs={"url": endpoint})
    channel_set = response.json().get("set")
    if not channel_set:
        return {}
    return channel_set


def get_episodes(parent_id, ts_from, ts_to):
    """Fetch episodes of one parent from the ``episodes`` HTTP endpoint.

    Args:
        parent_id: Value substituted into the ``parent_id`` query parameter.
        ts_from: Value substituted into the ``end_date__from`` parameter.
        ts_to: Value substituted into the ``start_date__to`` parameter.

    Returns:
        The value stored under the ``"set"`` key of the JSON response, or an
        empty dict when that key is absent or its value is falsy.
    """
    template = (
        "https://frontend.vh.yandex.ru/episodes"
        "?parent_id={}&end_date__from={}&start_date__to={}"
        "&geo_id=213&locale=ru"
    )
    endpoint = template.format(parent_id, ts_from, ts_to)
    response = retry_request("get", kwargs={"url": endpoint})
    episode_set = response.json().get("set")
    if not episode_set:
        return {}
    return episode_set


def get_episodes_wrapper(channels, now):
    """Attach the episodes around ``now`` to every channel, in place.

    For each channel the episodes endpoint is queried with a window of one
    hour before and one hour after ``now``; the result is stored under the
    channel's ``"current_episodes"`` key.

    Args:
        channels: Iterable of channel dicts; each must have ``"content_id"``.
        now: ``datetime.datetime`` marking the centre of the window
            (interpreted as local time when converted to a Unix timestamp).

    Returns:
        None. ``channels`` is mutated.
    """
    hour = datetime.timedelta(hours=1)
    # time.mktime() is the portable way to get epoch seconds from a local
    # datetime; strftime("%s") is a platform extension that is not among the
    # directives Python documents and is unavailable on some systems.
    ts_from = int(time.mktime((now - hour).timetuple()))
    ts_to = int(time.mktime((now + hour).timetuple()))
    for channel in channels:
        channel["current_episodes"] = get_episodes(
            channel["content_id"], ts_from, ts_to
        )


def check_channel(channel):
    """Decide whether a channel dict passes the status/type filter.

    A channel passes when its ``"status"`` contains ``published``, contains
    at least one of ``has_schedule`` / ``hide_schedule``, does not contain
    ``hidden``, and its ``"channel_type"`` is not ``personal``.

    Args:
        channel: Channel dict; a missing ``"status"`` is treated as empty.

    Returns:
        True if the channel passes every check, False otherwise.
    """
    status = channel.get("status", [])
    if u"published" not in status:
        return False
    if u"has_schedule" not in status and u"hide_schedule" not in status:
        return False
    if u"hidden" in status:
        return False
    return channel.get("channel_type") != u"personal"


def get_uuids_to_cgids(uuids, cluster):
    """Build a ``UUID -> ContentGroupID`` mapping for the given UUIDs.

    Runs a job on ``cluster`` that keeps the rows of the ``ContentGroup``
    table whose ``UUID`` is in ``uuids``, stores them in an intermediate
    table, then reads that table back.

    Args:
        uuids: Container of UUIDs, used for membership tests.
        cluster: Object providing ``job()`` and ``read(table)``.

    Returns:
        Dict mapping each row's ``"UUID"`` to its ``"ContentGroupID"``.
    """
    # NOTE(review): ``nf`` is not imported anywhere in the visible file, so
    # this function would raise NameError as-is -- confirm the intended import.
    # NOTE(review): presumably "$job_root" is expanded by the cluster
    # framework -- confirm.
    output_table = "$job_root/channels_join"
    job = cluster.job()

    content_groups = job.table("//home/video-hosting/base/ContentGroup")
    matching_rows = content_groups.filter(
        nf.custom(lambda x: x in uuids, "UUID")
    )
    matching_rows.put(output_table)

    job.run()

    mapping = {}
    for row in cluster.read(output_table):
        mapping[row["UUID"]] = row["ContentGroupID"]
    return mapping


# Patterns are written as escaped ``u''`` literals rather than ``ur''``: the
# ``ur`` prefix is a SyntaxError on Python 3, while ``u''`` with doubled
# backslashes is valid on both Python 2 and Python 3.
# Non-greedy so that each parenthesised group is removed on its own and the
# text between two groups survives.
_PARENTHESISED_RE = re.compile(u'\\(.+?\\)')
_DISALLOWED_CHARS_RE = re.compile(u'[^a-zа-яё0-9 ]')
_SPACE_RUN_RE = re.compile(u' +')


def normalize_str(s):
    """Normalize a title into a plain lowercase search query.

    Steps, in order: lowercase; drop every ``(...)`` group; drop every
    character other than Latin/Cyrillic lowercase letters, digits and spaces;
    collapse runs of spaces; strip leading/trailing whitespace.

    Args:
        s: Text to normalize (a unicode string).

    Returns:
        The normalized string.
    """
    s = s.lower()
    s = _PARENTHESISED_RE.sub(u'', s)
    s = _DISALLOWED_CHARS_RE.sub(u'', s)
    s = _SPACE_RUN_RE.sub(u' ', s)
    return s.strip()


def _select_airing_episode(episodes, now_ts):
    """Return the episode whose time span covers ``now_ts``, else the first."""
    for episode in episodes:
        if episode["start_time"] <= now_ts <= episode["end_time"]:
            return episode
    return episodes[0]


def generate_ptts(channels, now):
    """Generate query records for every channel and its airing episode.

    For each channel two ``"channel"`` records (channel title, with and
    without the "смотреть онлайн" suffix) and four ``"episode"`` records
    (combinations of episode and channel title) are produced. Every record
    carries the normalized query, a timestamp range, and the player URLs of
    both the channel and the episode.

    Args:
        channels: Iterable of channel dicts with ``"content_id"``,
            ``"deprecated_content_id"``, ``"title"`` and
            ``"current_episodes"``; episodes need the same id/title keys plus
            ``"start_time"`` and ``"end_time"``.
        now: ``datetime.datetime`` used as the reference moment (interpreted
            as local time when converted to a Unix timestamp).

    Returns:
        List of record dicts. Channels without episodes contribute nothing.
    """
    result = []
    hour = datetime.timedelta(hours=1)
    # time.mktime() instead of strftime("%s"): the latter is a platform
    # extension that Python does not document and some systems lack.
    now_ts = int(time.mktime(now.timetuple()))
    channel_ts_from = int(time.mktime((now - hour).timetuple()))
    channel_ts_to = int(time.mktime((now + hour).timetuple()))
    for ch in channels:
        episodes = ch.get("current_episodes")
        if not episodes:
            # Nothing to describe; avoids IndexError on an empty listing.
            continue
        # The episode list spans an hour either side of ``now``, so the first
        # entry may already be over; use the one that is on air right now.
        ep = _select_airing_episode(episodes, now_ts)
        ep_name = ep["title"]
        ch_name = ch["title"]
        ch_right_urls = [
            u"frontend.vh.yandex.ru/player/{}".format(ch["content_id"]),
            u"frontend.vh.yandex.ru/player/{}".format(ch["deprecated_content_id"]),
        ]
        ep_right_urls = [
            u"frontend.vh.yandex.ru/player/{}".format(ep["content_id"]),
            u"frontend.vh.yandex.ru/player/{}".format(ep["deprecated_content_id"]),
        ]
        right_urls = ch_right_urls + ep_right_urls
        for q in [
            ch_name, u"{} смотреть онлайн".format(ch_name)
        ]:
            result.append({
                "query": normalize_str(q),
                "ts_range": (channel_ts_from, channel_ts_to),
                "right_urls": right_urls,
                "channel": ch["content_id"],
                "channel_name": ch["title"],
                "query_type": "channel"
            })
        for q in [
            ep_name,
            u'{} {}'.format(ep_name, ch_name),
            u'{} {} смотреть онлайн'.format(ep_name, ch_name),
            u'{} смотреть онлайн'.format(ep_name)
        ]:
            result.append({
                "query": normalize_str(q),
                "ts_range": (ep["start_time"], ep["end_time"]),
                "right_urls": right_urls,
                "channel": ch["content_id"],
                "channel_name": ch["title"],
                "episode": ep["content_id"],
                "episode_name": ep["title"],
                "query_type": "episode"
            })
    return result


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as UTF-8, indented, key-sorted JSON."""
    with codecs.open(path, "w", "utf8") as f:
        json.dump(
            payload, f,
            indent=2,
            sort_keys=True,
            ensure_ascii=False,
        )


def main():
    """Collect channels and episodes, then write queries, PTTS and raw data.

    Command-line options:
        --raw_data: output path for the channel dump (default
            ``raw_data.json``).
        --ptts: output path for the generated records (default ``ptts.json``).
        --queries: output path for the query TSV (default ``queries.tsv``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_data", default="raw_data.json")
    parser.add_argument("--ptts", default="ptts.json")
    parser.add_argument("--queries", default="queries.tsv")
    args = parser.parse_args()

    now = datetime.datetime.now()
    # Computed once instead of per episode; time.mktime() is the portable
    # replacement for the platform-specific strftime("%s").
    now_ts = int(time.mktime(now.timetuple()))

    channels = [x for x in get_channels() if check_channel(x)]
    get_episodes_wrapper(channels, now)

    # Keep only channels with exactly one episode on air at ``now``.
    filtered = [
        ch for ch in channels
        if sum(
            1 for ep in ch["current_episodes"]
            if ep["start_time"] <= now_ts <= ep["end_time"]
        ) == 1
    ]

    ptts = generate_ptts(filtered, now)
    queries = {x["query"] for x in ptts}

    with codecs.open(args.queries, "w", "utf8") as f:
        f.write(
            u"\n".join(u"{}\t225".format(q) for q in sorted(queries)) + u"\n"
        )

    _write_json(args.ptts, ptts)
    # The raw dump contains every channel that passed check_channel(), not
    # only those that survived the on-air filter.
    _write_json(args.raw_data, channels)


if __name__ == "__main__":
    main()
