from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    cli,
    with_hints,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from qb2.api.v1 import typing as qt

import nile
import json
import sys
import urlparse
import datetime


# Page hosts whose long views parse_bs labels as "social". An embedded media
# host listed here also keeps a view from being labelled "pirate".
# Values are compared against hosts already normalized by clean().
SOCIAL_HOSTS = [
    "ok.ru", "m.ok.ru",
    "vk.com", "m.vk.com",
    "my.mail.ru",
    "rutube.ru",
]

# Hosts that parse_bs never labels as "pirate": a view is skipped when either
# the page host or the embedded media host is listed here (and the page host
# is not a social one). Values are compared against hosts already normalized
# by clean().
# NOTE: "rutube.ru" is also present in SOCIAL_HOSTS.
LEGAL_HOSTS = [
    "yandex.ru", "zen.yandex.ru", "afisha.yandex.ru", "browser.yandex.ru",
    "frontend.vh.yandex.ru", "yastatic.net",

    "google.com", "google.ru", "drive.google.com", "photos.google.com",

    "facebook.com", "web.facebook.com", "twitter.com", "coub.com", "ivi.ru",
    "twitch.tv", "m.twitch.tv", "worldoftanks.ru", "instagram.com",

    "kino.mail.ru", "tv.mail.ru", "otvet.mail.ru", "cloud.mail.ru",
    "e.mail.ru", "la.mail.ru", "r.mail.ru",

    "youtubekids.com", "kids.youtube.com", "rutube.ru",
    "kinopoisk.ru", "hd.kinopoisk.ru", "amediateka.ru", "premier.one",

    "ctc.ru", "ntv.ru", "1tv.ru", "kino.1tv.ru", "mk.ru", "rbc.ru", "ria.ru",
    "iz.ru", "ren.tv", "radiomayak.ru", "ntvplus.tv", "russia.tv",
    "live.russia.tv", "tvzvezda.ru", "domashniy.ru", "championat.com",
    "sputnik.by", "tut.by", "news.tut.by", "nn.ru",

    "megogo.ru", "megafon.tv", "netflix.com", "vimeo.com", "okko.tv",
    "more.tv", "tvzavr.ru",
]


def clean(url):
    """Normalize a URL or host string for comparison against the host lists.

    Removes a leading ``https://`` or ``http://`` scheme, then a leading
    ``www.`` label, then a single trailing ``/``.

    Only *leading* occurrences are removed. Blindly replacing the substrings
    anywhere in the string would corrupt hosts that merely contain them
    (``awww.example.com`` must not become ``aexample.com``).

    :param url: URL or network location as a string; may be empty.
    :return: the normalized string (``''`` for empty input).
    """
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]
    if url.endswith('/'):
        url = url[:-1]
    return url


# A view is reported only when its played duration exceeds this value
# (60 * 20; presumably seconds, i.e. 20 minutes -- confirm against the log format).
_MIN_VIEWTIME = 60 * 20


def _iter_videos(elem):
    """Yield ``(url, viewtime, duration, src_url)`` for every video entry of *elem*.

    Two element layouts are handled:

    * a dict with ``'url'`` and ``'data'`` keys, where ``'data'`` is a list of
      dicts carrying ``total_played_duration``, ``duration`` and ``media_url``;
    * otherwise a dict whose first key is the page URL and whose value holds a
      ``'p'`` list of positional entries.

    A duration equal to ``'live'`` is reported as ``-1``. Empty elements and
    elements of the second layout without a usable ``'p'`` list yield nothing.
    """
    if 'data' in elem and 'url' in elem:
        url = elem['url']
        for video_info in elem['data']:
            duration = video_info.get('duration', 0)
            yield (url,
                   video_info.get('total_played_duration', 0),
                   -1 if duration == 'live' else duration,
                   video_info['media_url'])
        return

    if not elem:
        return
    url = list(elem)[0]
    try:
        videos = elem[url].get('p')
    except AttributeError:
        # The value under the URL key is not a dict: nothing to extract.
        return
    for video_info in videos or ():
        # When the first slot is at most one character long it is treated as
        # a missing id, and every other field sits one position earlier.
        shift = -1 if len(str(video_info[0])) <= 1 else 0
        duration = video_info[4 + shift]
        yield (url,
               video_info[5 + shift],
               -1 if duration == 'live' else duration,
               video_info[9 + shift])


def _classify(url, src_url):
    """Return ``"social"``, ``"pirate"`` or ``None`` (skip) for one view.

    :param url: URL of the page the video was watched on.
    :param src_url: media URL of the video; a ``blob:`` marker is dropped
        before the host is extracted.
    """
    host = clean(urlparse.urlparse(url).netloc.lower())
    embed_host = clean(urlparse.urlparse(src_url.replace("blob:", "")).netloc.lower())

    if host == "youtube.com" or embed_host == "youtube.com":
        return None

    if host in SOCIAL_HOSTS:
        is_excluded_section = (
            "ok.ru/messages" in url
            or "vk.com/im" in url
            or "vk.com/feed" in url
            or ("ok.ru/profile/" in url and "/album/" in url)
        )
        return None if is_excluded_section else "social"

    if host not in LEGAL_HOSTS and embed_host not in LEGAL_HOSTS and embed_host not in SOCIAL_HOSTS:
        return "pirate"
    return None


@with_hints(output_schema=dict(yandexuid=str, url=str, viewtime=float, duration=float, src_url=str, cat=str))
def parse_bs(recs):
    """Emit one record per long video view found in each input record's ``vc`` field.

    ``rec.vc`` is parsed as JSON; rows that fail to parse are skipped. Every
    video entry of every parsed element is checked on its own, so a page with
    several videos can produce several records and no values leak from one
    element to the next. A view is emitted when the page URL is http(s), the
    played time exceeds ``_MIN_VIEWTIME`` and the hosts classify as
    ``"social"`` or ``"pirate"``.

    :param recs: iterable of records exposing ``vc`` and ``yandexuid``.
    :return: generator of ``Record(yandexuid, url, viewtime, duration, src_url, cat)``.
    """
    for rec in recs:
        try:
            parsed_vc = json.loads(rec.vc)
        except (TypeError, ValueError):
            # Best effort: a ``vc`` value that is not valid JSON text is skipped.
            continue
        for elem in parsed_vc:
            for url, viewtime, duration, src_url in _iter_videos(elem):
                if url.startswith(("http://", "https://")) and viewtime > _MIN_VIEWTIME:
                    cat = _classify(url, src_url)
                    if cat is not None:
                        yield Record(yandexuid=rec.yandexuid, url=url, viewtime=viewtime,
                                     duration=duration, src_url=src_url, cat=cat)


@cli.statinfra_job
def make_job(job, options, statface_client):
    """Configure the job that extracts long video views from bar-navig-log.

    The input covers the date in ``options.dates[0]`` and the six days before
    it. Rows are kept when ``vc`` and ``yandexuid`` are defined, ``yasoft``
    equals ``yabrowser`` and ``geo_id`` belongs to region 225; they are then
    mapped through ``parse_bs`` and written to ``$job_root/precalculated``.

    :param job: job object to configure.
    :param options: CLI options; only ``options.dates[0]`` (``YYYY-MM-DD``) is read.
    :param statface_client: not used by this job.
    :return: the configured job.
    """
    job = job.env(
        yt_spec_defaults={'pool_trees': ["physical"], 'tentative_pool_trees': ["cloud"]},
        templates={'job_root': '//home/videolog/itajn/pbr'}
    )

    day_format = '%Y-%m-%d'
    last_day = datetime.datetime.strptime(options.dates[0], day_format)
    first_day = last_day - datetime.timedelta(days=6)
    table_path = 'statbox/bar-navig-log/{%s..%s}' % (
        first_day.strftime(day_format),
        last_day.strftime(day_format)
    )

    log_table = job.table(table_path, weak_schema={'ip_numeric': qt.Optional[qt.String]})

    fields = [
        'yandexuid',
        se.dictitem('decoded_vc', from_='parsed_http_params'),
        # ``vc`` is the first item of ``decoded_vc`` when that value is non-empty.
        se.custom('vc', lambda values: values[0] if values else None, 'decoded_vc').with_type(qt.Json)
    ]
    filters = [
        sf.defined('vc', 'yandexuid'),
        sf.equals('yasoft', 'yabrowser'),
        sf.region_belongs([225], field='geo_id')
    ]

    parsed_log = log_table.qb2(log='bar-navig-log', fields=fields, filters=filters)
    parsed_log.map(parse_bs, memory_limit=4000).put('$job_root/precalculated')

    return job

# Script entry point: hand control to the nile CLI runner
# (presumably it dispatches to the @cli.statinfra_job-decorated make_job -- confirm).
if __name__ == '__main__':
    cli.run()
