#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import nile
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
import getpass
import datetime
import itertools
import requests
import time
import tldextract
from pytils import yt_get_date_from_table
from collections import defaultdict, Counter
import copy
import json
import urlparse
import re
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from sessions import get_channels, get_programs

# Silence urllib3's warning about unverified HTTPS requests
# (get_last_date issues a request with verify=False).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Channel name -> set of player URLs for that channel.  Starts empty and is
# rebound by main() once channel information has been read.
whitelist = {}


# Host prefixes removed by get_host().
bad_prefices = ('www.', 'm.')

# Root path under which the intermediate and report tables are written.
job_root = '//home/videolog/mma-1559'

# Leading "http://" or "https://", optionally followed by "www.".
RE_SCHEME = r'^https?://(www\.)?'

# Compiled form of RE_SCHEME, used by proc_url().
re_scheme = re.compile(RE_SCHEME)


def proc_url(url):
    """Return *url* without its leading http(s) scheme and optional 'www.'."""
    without_scheme = re_scheme.sub("", url)
    return without_scheme


def normalize_str(s):
    """Normalize a query or title for matching.

    Steps, in order: lowercase, drop parenthesized text, drop every character
    that is not a Latin/Cyrillic lowercase letter, digit or space, collapse
    runs of spaces and strip the ends.

    The patterns are plain ``u''`` literals (not ``ur''``) so that the module
    still parses on Python 3, where the ``ur`` prefix is a syntax error.

    :param s: text to normalize (unicode expected).
    :return: the normalized text; may be empty.
    """
    s = s.lower()
    # Greedy on purpose (unchanged): removes everything from the first '('
    # up to the last ')'.
    s = re.sub(u'\\(.+\\)', '', s)
    s = re.sub(u'[^a-zа-яё0-9 ]', '', s)
    s = re.sub(u' +', ' ', s)
    return s.strip()


def get_host(url, strip_tld=False):
    """Return the network location of *url* without 'www.' / 'm.' prefixes.

    A missing scheme is replaced with 'http://' before parsing.  With
    ``strip_tld`` set, the last dot-separated label is removed as well.
    """
    has_scheme = url.startswith(('http://', 'https://'))
    full_url = url if has_scheme else 'http://{}'.format(url)
    host = urlparse.urlparse(full_url).netloc

    # Keep peeling prefixes until none of them matches any more.
    while host.startswith(bad_prefices):
        for prefix in bad_prefices:
            if host.startswith(prefix):
                host = host[len(prefix):]

    if not strip_tld:
        return host
    labels = host.split('.')
    return '.'.join(labels[:-1])


def extract_host(url):
    """Return the ``registered_domain`` that tldextract reports for *url*."""
    extracted = tldextract.extract(url)
    return extracted.registered_domain


def tryfloat(x):
    """Convert *x* to float; return 0 when the conversion is not possible."""
    try:
        value = float(x)
    except (ValueError, TypeError):
        value = 0
    return value


class GetPlayerPosition(object):
    """Reducer locating whitelisted player URLs in video search results.

    For every video search request whose normalized query is a key of
    ``ptts`` and whose timestamp falls into one of the matching entries'
    ``'ts_range'``, one record is emitted with the position of the first
    video result pointing at a whitelisted player URL.

    ``ptts`` maps a normalized query to a list of dicts that carry at least
    ``'ts_range'`` (``(start, end)``, both ends inclusive) and ``'channel'``.
    The player URLs of a channel are read from the module-level
    ``whitelist``.
    NOTE(review): ``whitelist`` must therefore be populated in whatever
    process runs this reducer -- confirm this holds for remote execution.
    """

    # Ordered (ui label, request type names) pairs; the first pair with a
    # matching type wins.  Requests matching none of them are ignored.
    UI_REQUEST_TYPES = (
        ('desktop video', (
            'TYandexVideoRequest',
            'TYandexVideoMordaRequest',
            'TYandexRelatedVideoRequest',
        )),
        ('touch video', (
            'TTouchYandexVideoRequest',
            'TTouchYandexVideoPortalRequest',
            'TTouchYandexRelatedVideoRequest',
        )),
        ('pad video', (
            'TPadYandexVideoRequest',
            'TPadYandexVideoPortalRequest',
            'TPadYandexRelatedVideoRequest',
        )),
        ('app video', (
            'TMobileAppYandexVideoRequest',
            'TMobileAppYandexVideoPortalRequest',
            'TMobileAppYandexRelatedVideoRequest',
        )),
    )

    def __init__(self, ptts):
        self.ptts = ptts

    @classmethod
    def _detect_ui(cls, request):
        """Return the ui label of a video search request, or None."""
        for ui, type_names in cls.UI_REQUEST_TYPES:
            if any(request.IsA(name) for name in type_names):
                return ui
        return None

    def __call__(self, groups):
        """Yield one Record per matching video search request.

        Record fields: ``query`` (normalized), ``good_url_position``
        (``Position + 1`` of the first whitelisted result, or -1 when there
        is none), ``channel`` (channel of the first matching program),
        ``ts`` and ``first_url`` (URL of the first video result, '' if the
        request has no video results).
        """
        # Imported here, as in the original: libra.so is shipped to the job
        # through the reduce() ``files`` argument.
        import libra
        ptts = self.ptts
        for key, recs in groups:
            try:
                session = libra.ParseSession(recs, './blockstat.dict')
            except Exception:
                # Best effort (unchanged behaviour): a session that cannot
                # be parsed is skipped.
                continue

            for r in session:
                if self._detect_ui(r) is None:
                    continue

                if r.ServiceDomRegion != 'ru':
                    continue

                qnorm = normalize_str(r.Query.decode('utf8'))
                if not qnorm:
                    # Also covers an empty raw query.
                    continue
                ts = r.Timestamp

                # .get() never inserts, even when ptts is a defaultdict.
                candidates = ptts.get(qnorm)
                if not candidates:
                    continue

                right_programs = [
                    x for x in candidates
                    if x['ts_range'][0] <= ts <= x['ts_range'][1]
                ]
                if not right_programs:
                    continue

                right_urls = set()
                for x in right_programs:
                    right_urls.update(whitelist[x['channel']])

                # 0 means "not found yet"; turned into -1 after the loop.
                good_url_position = 0
                first_url = ''
                seen_first = False
                for bl in r.GetMainBlocks():
                    result = bl.GetMainResult()
                    if not result.IsA("TVideoResult"):
                        continue
                    url = result.Url
                    if not seen_first:
                        first_url = url
                        seen_first = True
                    # Compare without the query string and scheme.
                    processed = proc_url(url.split('?')[0])
                    if processed in right_urls and not good_url_position:
                        good_url_position = result.Position + 1
                if not good_url_position:
                    good_url_position = -1

                yield Record(
                    query=qnorm,
                    good_url_position=good_url_position,
                    channel=right_programs[0]['channel'],
                    ts=ts,
                    first_url=first_url
                )


def strip_y(y):
    """Return *y* without one leading 'y'; other strings are returned as is."""
    return y[1:] if y.startswith('y') else y


def make_program_to_ts(p, date, whitelist, extended=False):
    """Build a lookup from normalized query text to program/channel entries.

    :param p: mapping of channel id to a dict with ``'info'`` (containing
        ``'title'``) and ``'programs'`` (each with ``'program_title'``,
        ``'start_time'``, ``'end_time'``, ``'content_id'`` and optionally
        ``'blacked'``).
    :param date: the day being processed; must provide ``timetuple()``
        (presumably a ``datetime.date`` / ``datetime`` -- confirm against
        ``yt_get_date_from_table``).
    :param whitelist: iterable of channel names; used only when ``extended``
        is false, to add one whole-day entry per channel name.
    :param extended: when true, also register "<channel>", "<program>
        <channel>" and "смотреть онлайн" variants for every program.
    :return: ``defaultdict(list)`` mapping normalized name to entry dicts
        with ``'name'``, ``'normalized_name'``, ``'channel'``,
        ``'ts_range'`` and, for program entries, ``'content_id'``.
    """
    # Local-time epoch of the start of the day.  time.mktime() replaces the
    # non-portable strftime('%s') directive.
    start = int(time.mktime(date.timetuple()))
    full_date_tup = (start, start + 86399)  # the whole day, inclusive
    result = defaultdict(list)

    for ch in p:
        for pr in p[ch]['programs']:
            if pr.get('blacked'):
                continue
            pt = pr['program_title']
            tup = (pr['start_time'], pr['end_time'])
            channel_name = p[ch]['info']['title']

            names = [pt]
            if extended:
                names.extend((
                    u'{}'.format(channel_name),
                    u'{} смотреть онлайн'.format(channel_name),
                    u'{} {}'.format(pt, channel_name),
                    u'{} {} смотреть онлайн'.format(pt, channel_name),
                    u'{} смотреть онлайн'.format(pt),
                ))

            for name in names:
                norm = normalize_str(name)
                result[norm].append({
                    'name': name,
                    'normalized_name': norm,
                    'content_id': pr['content_id'],
                    'ts_range': tup,
                    'channel': channel_name
                })

    if not extended:
        # A query that is just the channel name counts for the whole day.
        for ch in whitelist:
            norm = normalize_str(ch)
            result[norm].append({
                'name': ch,
                'normalized_name': norm,
                'channel': ch,
                'ts_range': full_date_tup
            })

    # NOTE: hand-written whole-day aliases for short channel names
    # (ntv_cv, ch5_cv, tnt, match, ctc) used to be appended here; they were
    # already disabled (commented out) and have been removed.
    return result


def totalize(records):
    """Fan every record out into its query/channel '_total_' combinations.

    Each input record yields four records: the original (query, channel)
    pair plus the variants where the query, the channel, or both are
    replaced with '_total_'.
    """
    for rec in records:
        for query in (rec.query, '_total_'):
            for channel in (rec.channel, '_total_'):
                yield Record(
                    query=query,
                    channel=channel,
                    good_url_position=rec.good_url_position
                )


class TotalReduce(object):
    """Reducer turning per-request positions into top-N shares per group."""

    def __init__(self, date):
        self.date = date

    def __call__(self, groups):
        """Yield one Record per group.

        The record holds the group key fields, ``fielddate``, the share of
        requests whose ``good_url_position`` lies in ``[0, N]`` for
        N in 1, 3, 5, 10 (``topN``) and the number of requests (``count``).
        """
        for key, records in groups:
            positions = Counter(rec.good_url_position for rec in records)
            total = sum(positions.values())

            row = key.to_dict()
            row['fielddate'] = self.date.strftime('%Y-%m-%d')
            for threshold in (1, 3, 5, 10):
                # Position -1 ("not found") never falls into the range.
                hits = sum(
                    amount for pos, amount in positions.items()
                    if 0 <= pos <= threshold
                )
                row['top{}'.format(threshold)] = hits / total
            row['count'] = total
            yield Record(**row)


def process_table(cluster, table, date, report):
    """Process one user-sessions table and publish the daily report.

    Runs a job that extracts player positions from ``table``, stores them
    under ``<job_root>/<date>/full``, aggregates them into
    ``<job_root>/<date>/report`` and uploads that table to the Statface
    report ``report``.
    """
    report_table = '{}/{}/report'.format(job_root, date)

    # Keep only the programs of whitelisted channels.
    day_programs = get_programs(date.strftime('%Y-%m-%d'))
    programs = {}
    for channel_id, channel in day_programs.items():
        if channel['info']['title'] in whitelist:
            programs[channel_id] = channel
    ptts = make_program_to_ts(programs, date, whitelist)

    job = cluster.job()

    sessions = job.table(table).groupby('key').sort('subkey')
    positions = sessions.reduce(
        GetPlayerPosition(ptts),
        files=[
            nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
            nile.files.RemoteFile('statbox/resources/libra.so')
        ],
        memory_limit=4000
    )
    stored = positions.put('{}/{}/full'.format(job_root, date))

    totals = stored.map(totalize).groupby('query', 'channel')
    aggregated = totals.reduce(TotalReduce(date))
    aggregated.sort('fielddate', 'query', 'channel').put(report_table)

    job.run()

    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=os.environ['STAT_LOGIN'],
        password=os.environ['STAT_TOKEN']
    )

    statface_report = ns.StatfaceReport().path(report).scale('daily')
    statface_report = statface_report.replace_mask('fielddate').client(client)
    statface_report.data(cluster.read(report_table)).publish()


def get_last_date(args):
    """Return the most recent ``fielddate`` already present in the report.

    Fetches ``args.report`` from Statface as JSON, picks the row with the
    greatest ``fielddate`` and converts its date part (the text before the
    first space) with ``yt_get_date_from_table``.

    :param args: parsed command-line arguments; only ``args.report`` is read.
    :raises requests.HTTPError: when Statface answers with an error status.
    :raises IndexError: when the report has no rows (same exception type as
        the former ``values[0]`` failure, now with an explicit message).
    """
    report = args.report
    headers = {
        'StatRobotUser': os.environ['STAT_LOGIN'],
        'StatRobotPassword': os.environ['STAT_TOKEN']
    }
    # NOTE(review): verify=False disables TLS certificate verification while
    # credentials are sent in the headers; kept as is, but consider pointing
    # ``verify`` at the internal CA bundle instead.
    req = requests.get(
        'https://stat.yandex-team.ru/{}?_type=json'.format(report),
        headers=headers, verify=False
    )
    # Fail with the real HTTP error instead of an obscure JSON/KeyError later.
    req.raise_for_status()

    values = req.json()['values']
    if not values:
        raise IndexError(
            'report {!r} has no rows, cannot detect the last date'.format(
                report
            )
        )
    latest = max(values, key=lambda x: x['fielddate'])
    return yt_get_date_from_table(
        latest['fielddate'].split(' ')[0]
    )


def check_channel(channel):
    """Tell whether *channel* should be taken into account.

    A channel qualifies when its ``'status'`` contains ``published`` and at
    least one of ``has_schedule`` / ``hide_schedule``.  A missing status
    counts as empty.  (A further ``has_cachup`` requirement is disabled.)
    """
    status = channel.get('status', [])
    if u'published' not in status:
        return False
    return u'has_schedule' in status or u'hide_schedule' in status


def main():
    """Command-line entry point.

    Builds the channel whitelist (channel name -> player URLs), selects the
    user-sessions tables to process -- the ``--from``/``--to`` range when
    both are given, otherwise everything newer than the last date already
    in the report -- and runs ``process_table`` for each of them.
    """
    global whitelist
    parser = argparse.ArgumentParser()
    parser.add_argument('--pool')
    # NOTE: --whitelist is still accepted but no longer read; the whitelist
    # is built from channel data below.
    parser.add_argument('--whitelist', default='channels_whitelist.json')
    parser.add_argument('--from')
    parser.add_argument('--to')
    parser.add_argument(
        '--report', default='Video/Others/live_offline'
    )
    args = parser.parse_args()

    report = args.report

    kwargs = {'token': os.environ['YT_TOKEN']}
    if args.pool:
        kwargs['pool'] = args.pool

    cluster = clusters.yt.Hahn(**kwargs).env(
        templates=dict(
            job_root=job_root,
        ),
    )

    channels = [x for x in get_channels() if check_channel(x)]
    content_ids = {x['content_id'] for x in channels}

    # Copy the ContentGroup rows of the selected channels to a scratch table.
    job = cluster.job()
    job.table(
        '//home/video-hosting/base/ContentGroup'
    ).filter(
        nf.custom(lambda x: x in content_ids, 'UUID')
    ).put(
        '$job_root/test'
    )
    job.run()

    channel_info = [rec.to_dict() for rec in cluster.read('$job_root/test')]

    whitelist = {
        x['Name'].decode('utf8'): {
            u"frontend.vh.yandex.ru/player/{}".format(x['ContentGroupID']),
            u"frontend.vh.yandex.ru/player/{}".format(x['UUID']),
        } for x in channel_info
    }

    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')
    if from_ and to_:
        # Parse the bounds once instead of once per candidate path.
        date_from = yt_get_date_from_table(from_)
        date_to = yt_get_date_from_table(to_)

        def path_filter(path):
            return (
                path.endswith('/clean') and
                date_from <= yt_get_date_from_table(path) <= date_to
            )
    else:
        # Also reached when only one of --from / --to is given.
        last_date = get_last_date(args)

        def path_filter(path):
            return (
                path.endswith('/clean') and
                yt_get_date_from_table(path) > last_date
            )

    tables = cluster.driver.client.search(
        root='//user_sessions/pub/search/daily',
        path_filter=path_filter
    )

    for table in tables:
        date = yt_get_date_from_table(table)
        process_table(cluster, table, date, report)


# Run the job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
