#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import nile
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
import getpass
import datetime
import requests
import time
import tldextract
from pytils import yt_get_date_from_table
from collections import defaultdict, Counter
import copy
import json
import urlparse
import re
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence urllib3's InsecureRequestWarning: get_last_date() issues a request
# with verify=False, which would otherwise emit that warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Host prefixes that get_host() strips (repeatedly) from the front of a netloc.
bad_prefices = ('www.', 'm.')


# Leading "http://" or "https://" followed by an optional "www." label.
# The raw pattern string is also interpolated into the YQL query built in
# process_table(), so both sides strip the same prefix from URLs.
RE_SCHEME = r'^https?://(www\.)?'

# Pre-compiled form of RE_SCHEME used by proc_url().
re_scheme = re.compile(RE_SCHEME)


def proc_url(url):
    """Return ``url`` without the leading prefix matched by ``re_scheme``.

    The pattern is anchored at the start of the string, so at most one
    prefix (``http://`` or ``https://`` plus an optional ``www.``) is removed.
    A URL that does not start with a scheme is returned unchanged.
    """
    prefix = re_scheme.match(url)
    if prefix is None:
        return url
    return url[prefix.end():]


def get_host(url, strip_tld=False):
    """Extract the network location of ``url`` without ``www.``/``m.`` prefixes.

    Args:
        url: URL with or without an ``http://`` / ``https://`` scheme; a
            missing scheme is replaced by ``http://`` so that ``urlparse``
            recognises the netloc.
        strip_tld: when true, also drop the last dot-separated label.

    Returns:
        The netloc with every leading entry of ``bad_prefices`` removed
        (repeatedly, so ``www.m.example.com`` becomes ``example.com``).
    """
    has_scheme = url.startswith(('http://', 'https://'))
    full_url = url if has_scheme else 'http://{}'.format(url)
    host = urlparse.urlparse(full_url).netloc

    # Keep making passes over the prefixes until a pass changes nothing.
    while True:
        trimmed = host
        for prefix in bad_prefices:
            if trimmed.startswith(prefix):
                trimmed = trimmed[len(prefix):]
        if trimmed == host:
            break
        host = trimmed

    if not strip_tld:
        return host
    # Everything before the last dot; empty string when there is no dot.
    return host.rpartition('.')[0]


def extract_host(url):
    """Return the ``registered_domain`` that ``tldextract`` derives from ``url``."""
    extracted = tldextract.extract(url)
    return extracted.registered_domain


def tryfloat(x):
    """Convert ``x`` to ``float``; return ``0`` if the conversion is rejected.

    Only ``TypeError`` and ``ValueError`` are treated as "not a number";
    any other exception raised by ``float`` propagates.
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        value = 0
    return value


def get_vt_and_classifiers(groups):
    """Reducer: emit one record per video result that has viewing data.

    For every ``(key, recs)`` group the records are parsed into a libra
    session. Only video requests (desktop, touch, pad and mobile-app
    flavours) whose ``ServiceDomRegion`` is ``'ru'`` are considered. For each
    such request a dict of 0/1 query classifiers is built (``series``,
    ``film``, ``porn``, ``fresh``), and every main-block ``TVideoResult`` that
    has duration info and/or a heartbeat is emitted.

    Args:
        groups: iterable of ``(key, recs)`` pairs as passed to a Nile
            ``reduce``; ``key.key`` is used as the user id.

    Yields:
        ``Record`` with fields ``yandexuid`` (``key.key`` without a leading
        ``'y'``), ``Url`` (scheme-stripped result URL), ``classifiers`` (the
        dict described above, shared by all results of one request) and
        ``vt`` (the larger of the duration-based and heartbeat-based
        viewing-time values that are available).
    """
    # Imported lazily: libra.so is shipped to the job via ``files=`` in
    # process_table(), so it is only importable where the reducer runs.
    import libra

    # Every request type accepted below is treated identically, so a single
    # membership test replaces the per-UI branches.
    video_request_types = (
        # desktop video
        'TYandexVideoRequest',
        'TYandexVideoMordaRequest',
        'TYandexRelatedVideoRequest',
        # touch video
        'TTouchYandexVideoRequest',
        'TTouchYandexVideoPortalRequest',
        'TTouchYandexRelatedVideoRequest',
        # pad video
        'TPadYandexVideoRequest',
        'TPadYandexVideoPortalRequest',
        'TPadYandexRelatedVideoRequest',
        # app video
        'TMobileAppYandexVideoRequest',
        'TMobileAppYandexVideoPortalRequest',
        'TMobileAppYandexRelatedVideoRequest',
    )

    for key, recs in groups:
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Best effort: a session libra cannot parse is skipped.
            continue

        # Same for every record of this group, so compute it once.
        yandexuid = strip_y(key.key)

        for r in session:
            if not any(r.IsA(name) for name in video_request_types):
                continue
            if r.ServiceDomRegion != 'ru':
                continue

            relev = r.RelevValues
            spv = r.SearchPropsValues

            is_series = False
            if 'vserial' in relev:
                try:
                    is_series = bool(relev['vserial'] > 0)
                except Exception:
                    # A value that cannot be compared counts as "not series".
                    is_series = False

            is_film = (
                'REPORT.entity' in spv and
                'Film/Film' in spv['REPORT.entity']
            )
            is_porn = str(spv.get('VIDEO.VideoPorno.vidprn')) == 'ipq1'
            is_fresh = tryfloat(spv.get(
                'UPPER.ApplyVideoBlender.IntentWeight/VIDEOQUICK'
            )) > 0

            classifiers = {
                'series': int(is_series),
                'film': int(is_film),
                'porn': int(is_porn),
                'fresh': int(is_fresh),
            }

            for bl in r.GetMainBlocks():
                result = bl.GetMainResult()
                if not result.IsA("TVideoResult"):
                    continue

                duration = r.FindVideoDurationInfo(result)
                heartbeat = r.FindVideoHeartbeat(result, 'ANY')
                if not (duration or heartbeat):
                    # No viewing signal at all for this result.
                    continue

                # Collect only the estimates that exist, so the maximum is
                # never taken over a missing (None) value.
                watch_times = []
                if duration:
                    watch_times.append(
                        min(duration.PlayingDuration, duration.Duration)
                    )
                if heartbeat:
                    watch_times.append(
                        min(heartbeat.Ticks, heartbeat.Duration)
                    )

                yield Record(
                    yandexuid=yandexuid,
                    Url=proc_url(result.Url),
                    classifiers=classifiers,
                    vt=max(watch_times)
                )


def strip_y(y):
    """Return ``y`` without its first character when that character is ``'y'``.

    Only one leading ``'y'`` is dropped; other strings come back unchanged.
    """
    offset = 1 if y.startswith('y') else 0
    return y[offset:]


def process_table(cluster, table, yql_token, date):
    """Build the per-day intermediate and final tables for one sessions table.

    Steps, in order:

    1. Submit a YQL operation that rewrites the ``content_merged`` table with
       scheme-stripped ``Url`` / ``ContentType`` pairs.
    2. Meanwhile run a Nile job that reduces ``table`` with
       ``get_vt_and_classifiers`` and aggregates the output per URL into
       ``$job_root/<date>/us_tvt_url``.
    3. Poll the YQL operation: at most 5 status checks, 5 minutes apart.
    4. Run a second Nile job that writes ``top_hosts``, ``us_tvt_joined`` and
       ``final`` under ``$job_root/<date>/``.
    5. Write an empty record list to the ``us_tvt_url`` table.

    Args:
        cluster: Nile cluster with the ``job_root`` template configured.
        table: path of the user-sessions table to process.
        yql_token: OAuth token for the YQL HTTP API.
        date: date string used as the per-day directory name.

    Returns:
        ``Counter`` mapping host to its row count in the ``top_hosts`` table.

    Raises:
        requests.HTTPError: if submitting the YQL operation returns an HTTP
            error status.
    """
    tvt_url_table = '$job_root/{}/us_tvt_url'.format(date)
    joined_url_table = '$job_root/{}/us_tvt_joined'.format(date)
    top_hosts_table = '$job_root/{}/top_hosts'.format(date)
    content_merged_table = 'home/videolog/mma-1118/content_merged'

    pending_statuses = {'PENDING', 'RUNNING'}
    max_polls = 5
    poll_interval = 5 * 60  # seconds between status checks

    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'OAuth {}'.format(yql_token)
    }
    req = requests.post(
        'https://yql.yandex.net/api/v2/operations',
        json={
            'content': '$replace = Re2::Replace("{}");\n'
            'use hahn;\n'
            'insert into [{}] with truncate\n'
            'select $replace(Url, "") as Url, ContentType from '
            'Range([home/videoindex/content/state], [0000], [9999], [urls])\n'
            'group by (Url, ContentType) order by Url'.format(
                RE_SCHEME,
                content_merged_table
            ),
            'action': 'RUN',
            'type': 'SQL',
            'title': 'MMA-1118 Content Merge | YQL'
        },
        headers=headers
    )
    # Fail with a clear HTTP error instead of a KeyError on the missing 'id'.
    req.raise_for_status()
    operation = req.json()
    id_ = operation['id']
    status = operation['status']

    # First Nile job: sessions -> per-URL viewing flag and classifiers.
    # It runs while the YQL operation submitted above is in progress.
    job = cluster.job()
    job.table(
        table
    ).groupby(
        'key'
    ).sort(
        'subkey'
    ).reduce(
        get_vt_and_classifiers,
        files=[
            nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
            nile.files.RemoteFile('statbox/resources/libra.so')
        ],
        memory_limit=4000
    ).groupby(
        'Url'
    ).aggregate(
        tvt=na.sum('vt'),
        classifiers=na.any('classifiers'),
    ).project(
        'Url', 'classifiers', watched=ne.custom(
            lambda x: 1 if x else 0, 'tvt'
        )
    ).put(
        tvt_url_table
    )
    job.run()

    # Wait for the YQL operation. Sleep only when another check will follow,
    # so a finished operation does not cost an extra poll interval.
    tries = 0
    while status in pending_statuses and tries < max_polls:
        req = requests.get(
            'https://yql.yandex.net/api/v2/operations/{}'.format(id_),
            headers=headers
        )
        status = req.json()['status']
        tries += 1
        if status in pending_statuses and tries < max_polls:
            time.sleep(poll_interval)
    if status != 'COMPLETED':
        # Best effort: report and carry on with whatever content_merged
        # currently holds.
        sys.stderr.write('operation {} failed: {}'.format(id_, req.content))

    # Second Nile job: host statistics and the joined/final tables.
    job = cluster.job()

    content_merged = job.table(
        content_merged_table
    ).groupby(
        'Url'
    ).aggregate(
        ContentType=na.distinct('ContentType')
    )

    tvt_url = job.table(tvt_url_table)

    tvt_url.project(
        host=ne.custom(get_host, 'Url')
    ).groupby('host').aggregate(
        count=na.count()
    ).put(
        top_hosts_table
    )

    tvt_url.join(
        content_merged, type='left', by='Url'
    ).put(
        joined_url_table
    ).project(
        'Url', 'watched',
        host=ne.custom(get_host, 'Url'),
        # Serialise to strings so the values can be used as group-by keys.
        classifiers=ne.custom(
            lambda x: json.dumps(x, sort_keys=True), 'classifiers'
        ),
        ContentType=ne.custom(
            lambda x: json.dumps(x, sort_keys=True), 'ContentType'
        ),
    ).groupby(
        'host', 'ContentType', 'classifiers', 'watched'
    ).aggregate(
        count=na.count()
    ).put(
        '$job_root/{}/final'.format(date)
    )

    job.run()

    host_stats = Counter(
        {rec.host: rec.count for rec in cluster.read(top_hosts_table)}
    )

    # Overwrite the per-URL intermediate table with no records
    # (presumably to free space -- TODO confirm).
    cluster.write(tvt_url_table, [])

    return host_stats


def get_last_date(args):
    """Return the date of the most recent row already present in the report.

    Fetches ``args.report`` as JSON from the Statface upload host, using
    ``args.stat_login`` / ``args.stat_password`` as robot credentials, picks
    the row with the greatest ``fielddate`` and passes the part of that value
    before the first space to ``yt_get_date_from_table``.

    TLS certificate verification is disabled for this request.
    """
    credentials = {
        'StatRobotUser': args.stat_login,
        'StatRobotPassword': args.stat_password
    }
    url = 'https://upload.stat.yandex-team.ru/{}?_type=json'.format(
        args.report
    )
    response = requests.get(url, headers=credentials, verify=False)

    rows = response.json()['values']
    newest_first = sorted(
        rows, key=lambda row: row['fielddate'], reverse=True
    )
    newest_fielddate = newest_first[0]['fielddate']
    day_part = newest_fielddate.split(' ')[0]
    return yt_get_date_from_table(day_part)


def _build_report_rows(recs, top_hosts, date):
    """Turn rows of the ``final`` table into Statface report rows.

    Args:
        recs: iterable of records with ``host``, ``watched``, ``count`` and
            JSON-encoded ``classifiers`` / ``ContentType`` fields.
        top_hosts: hosts reported under their own name; any other host is
            reported as ``'_other_'``.
        date: value written to ``fielddate`` of every row.

    Returns:
        List of dicts, three per ``(category, watched, host)`` slice, holding
        the share of that slice's count with audio, video, and audio+video
        content types.
    """
    by_key = defaultdict(Counter)
    for rec in recs:
        classifiers = json.loads(rec.classifiers)
        if not classifiers:
            continue
        categories = [name for name in classifiers if classifiers[name]]
        if not categories:
            categories = ['other']
        categories.append('_total_')
        host = rec.host if rec.host in top_hosts else '_other_'
        # Each record contributes to its own slice and to every '_total_'
        # roll-up along the category, watched and host dimensions.
        for category in categories:
            for watched in (rec.watched, '_total_'):
                for host_ in (host, '_total_'):
                    key_ = (category, watched, host_)
                    by_key[key_][rec.ContentType] += rec.count

    rows = []
    for key_, counts in by_key.items():
        category, watched, host_ = key_
        total = sum(counts.values())
        audio_only = counts['["EAudio"]']
        video_only = counts['["EVideo"]']
        # The order of the serialised two-element list is not pinned down
        # anywhere in this file, so both spellings are accepted. Counter
        # yields 0 for a spelling that never occurs.
        both = (
            counts['["EVideo", "EAudio"]'] +
            counts['["EAudio", "EVideo"]']
        )
        shares = (
            ('audio', audio_only + both),
            ('video', video_only + both),
            ('audio+video', both),
        )
        for content_type, numerator in shares:
            rows.append({
                'fielddate': date,
                'category': category,
                'watched': watched,
                'host': host_,
                'content_type': content_type,
                # True division: the file imports ``division`` from
                # ``__future__``.
                'value': numerator / total,
            })
    return rows


def main():
    """Compute content-coverage shares per day and publish them to Statface.

    Command-line options: ``--pool`` (YT pool), ``--from`` / ``--to`` (date
    range; when either is missing, every day after the last date already in
    the report is processed), ``--report`` (Statface report path) and
    ``--stat_login`` / ``--stat_password`` (Statface credentials).

    The YT token is read from ``YT_TOKEN``; ``YQL_TOKEN`` is read only when a
    day has to be (re)computed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--pool')
    parser.add_argument('--from')
    parser.add_argument('--to')
    parser.add_argument(
        '--report', default='Video/Others/content_coverage_by_user_sessions'
    )
    parser.add_argument('--stat_login', '-sl')
    parser.add_argument('--stat_password', '-sp')
    args = parser.parse_args()

    report = args.report

    kwargs = {'token': os.environ['YT_TOKEN']}
    if args.pool:
        kwargs['pool'] = args.pool

    job_root = 'home/videolog/mma-1287'
    cluster = clusters.yt.Hahn(**kwargs).env(
        templates=dict(
            job_root=job_root,
        ),
        # parallel_operations_limit=10
    )

    # 'from' is a keyword, so the attribute is only reachable via getattr.
    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')
    if from_ and to_:
        # Parse the bounds once rather than for every candidate path.
        first_date = yt_get_date_from_table(from_)
        final_date = yt_get_date_from_table(to_)
        tables = cluster.driver.client.search(
            root='//user_sessions/pub/search/daily',
            path_filter=lambda x: x.endswith('/clean') and
            first_date <= yt_get_date_from_table(x) <= final_date
        )
    else:
        last_date = get_last_date(args)
        tables = cluster.driver.client.search(
            root='//user_sessions/pub/search/daily',
            path_filter=lambda x: x.endswith('/clean') and
            yt_get_date_from_table(x) > last_date
        )

    for table in tables:
        date = str(yt_get_date_from_table(table))
        data_table = '{}/{}/final'.format(job_root, date)
        host_stats_table = '{}/{}/top_hosts'.format(job_root, date)
        if cluster.driver.exists(data_table) and cluster.driver.exists(
            host_stats_table
        ):
            # Both outputs already exist for this day: reuse them.
            host_stats = Counter(
                {rec.host: rec.count for rec in cluster.read(host_stats_table)}
            )
        else:
            print('processing table {}'.format(table))
            host_stats = process_table(
                cluster,
                table,
                os.environ['YQL_TOKEN'],
                date
            )
        top50hosts = {x[0] for x in host_stats.most_common(50)}

        result = _build_report_rows(
            cluster.read(data_table), top50hosts, date
        )

        client = ns.StatfaceClient(
            proxy='upload.stat.yandex-team.ru',
            username=args.stat_login,
            password=args.stat_password
        )

        ns.StatfaceReport().path(
            report
        ).scale('daily').replace_mask(
            'fielddate'
        ).client(
            client
        ).data(
            result
        ).publish()


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
