#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import json

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf

import nile
from nile.utils.misc import coerce_path
# import libra
import datetime
import uatraits
import urllib
import re
import random
import urlparse
import getpass
import subprocess

# Login name of the user running the script; main() uses it as the default
# value of the --pool argument.
username = getpass.getuser()

# strptime/strftime pattern for YYYY-MM-DD dates.
# NOTE(review): not referenced anywhere in the visible code (date_range
# hard-codes the same pattern) - confirm before removing.
date_format = '%Y-%m-%d'


class GetProperty(object):
    """Callable that extracts one property from a JSON-encoded object.

    Construct it with the property name, then call the instance with a JSON
    string. The call returns:

    * ``'-'`` when the input is not valid JSON or does not decode to a dict;
    * the UTF-8 encoded property value when the property is present;
    * the UTF-8 encoded literal ``'None'`` when the property is absent.

    The property value is expected to be a string; any other value type
    raises ``AttributeError`` from the ``.encode`` call, as it always did.
    """

    def __init__(self, prop):
        self.prop = prop

    def __call__(self, val):
        try:
            obj = json.loads(val)
        except (TypeError, ValueError):
            # TypeError: val is not a string (e.g. None).
            # ValueError: val is not valid JSON.
            return '-'
        # Explicit check instead of ``assert`` so the guard still works when
        # Python runs with -O (which strips assert statements).
        if not isinstance(obj, dict):
            return '-'

        return obj.get(self.prop, 'None').encode('utf-8')


# Column descriptors (name + type); every column is string-typed.
# NOTE(review): not referenced anywhere in the visible code - presumably it
# describes the input plays table read by main(); confirm.
schema = [
    {"type": "string", "name": "AppID"},
    {"type": "string", "name": "app_platform"},
    {"type": "string", "name": "autoplay"},
    {"type": "string", "name": "date"},
    {"type": "string", "name": "device_id"},
    {"type": "string", "name": "event_datetime"},
    {"type": "string", "name": "event_name"},
    {"type": "string", "name": "frame_url"},
    {"type": "string", "name": "page_url"},
    {"type": "string", "name": "raw_event_value"},
    {"type": "string", "name": "session_type"},
    {"type": "string", "name": "src_url"},
]


class GetHRPath(object):
    """Callable that turns a templated table path into a '//'-prefixed one.

    The path is passed through ``coerce_path`` and evaluated with the
    templates of the cluster environment given at construction time; the
    resulting string gets a leading ``'//'`` unless it already has one.
    """

    def __init__(self, cluster):
        self.cluster = cluster

    def __call__(self, path):
        templates = self.cluster.environment.templates
        resolved = str(coerce_path(path).eval(**templates))
        return resolved if resolved.startswith('//') else '//' + resolved


def date_range(from_, to_):
    """Return every date between *from_* and *to_*, both ends included.

    Each bound may be a ``datetime.date`` or a ``'YYYY-MM-DD'`` string.
    The list runs from *from_* to *to_*: ascending when ``from_ <= to_``,
    descending otherwise.

    Raises ``ValueError`` if a string bound does not match ``%Y-%m-%d``.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring
    if isinstance(from_, string_types):
        from_ = datetime.datetime.strptime(from_, '%Y-%m-%d').date()
    if isinstance(to_, string_types):
        to_ = datetime.datetime.strptime(to_, '%Y-%m-%d').date()

    # Compute the bounds once instead of on every loop iteration.
    first, last = min(from_, to_), max(from_, to_)
    one_day = datetime.timedelta(days=1)

    result = []
    current = first
    while current <= last:
        result.append(current)
        current += one_day
    if to_ < from_:
        # Caller asked for a descending range.
        result.reverse()
    return result


# Module-level placeholder. NOTE: main() assigns a *local* variable with the
# same name (it has no ``global`` statement), so this value stays None.
get_hr_path = None


def det_dt(edt):
    """Reduce a '<date> <time>' string to '<date>_<hour>'.

    The text before the first space is kept as is; from the text after it
    only the part preceding the first ':' is kept. For example
    ``'2018-05-01 13:45:10'`` becomes ``'2018-05-01_13'``.

    If *edt* contains no space the hour part is empty and the result ends
    with ``'_'``.
    """
    # The original called .split(':') on the list returned by split(' '),
    # which raised AttributeError for every input.
    date_part, _, time_part = edt.partition(' ')
    return date_part + '_' + time_part.split(':')[0]


def yandex_check(url):
    """Return True if *url* is on yandex.ru, yandex.ua or yandex.com.tr.

    The decision uses the ``domain`` and ``suffix`` fields of
    ``tldextract.extract(url)``; anything else yields False.
    """
    # Imported inside the function rather than at module level. The original
    # probed for a misspelled name ("tldexract") inside a bare except, so it
    # always fell through to these imports anyway; import directly instead.
    import tldextract
    # idna is not used directly here; the import is kept from the original.
    # NOTE(review): presumably a fail-fast check that the package is present
    # on the worker - confirm.
    import idna  # noqa: F401
    tld = tldextract.extract(url)
    return tld.domain == 'yandex' and tld.suffix in {'ru', 'ua', 'com.tr'}


def extract_host(url):
    """Return the ``registered_domain`` that tldextract reports for *url*."""
    # Imported inside the function rather than at module level. The original
    # probed for a misspelled name ("tldexract") inside a bare except, so it
    # always fell through to these imports anyway; import directly instead.
    import tldextract
    # idna is not used directly here; the import is kept from the original.
    # NOTE(review): presumably a fail-fast check that the package is present
    # on the worker - confirm.
    import idna  # noqa: F401
    return tldextract.extract(url).registered_domain


def extract_autoplay(raw):
    """Return True if the JSON object in *raw* has ``"autoplay": "on"``.

    Returns False when *raw* is not a string, is not valid JSON, does not
    decode to a JSON object, or has any other (or no) ``autoplay`` value.
    """
    try:
        payload = json.loads(raw)
    except (TypeError, ValueError):
        # TypeError: raw is not a string (e.g. None); ValueError: bad JSON.
        return False
    if not isinstance(payload, dict):
        # Valid JSON that is not an object (list, number, ...) has no
        # properties; previously this crashed on .get().
        return False
    return payload.get('autoplay', '') == 'on'


def get_2n_category(n):
    """Bucket *n* into a power of two between 2 and 2 ** 20.

    Returns the smallest ``2 ** x`` (x from 1 to 20) that is >= *n*; values
    above ``2 ** 20`` are clamped to ``2 ** 20`` and values below 2 map to 2.
    *n* must be an ``int`` (checked with ``assert``).
    """
    assert isinstance(n, int)
    ceiling = 2 ** 20
    bucket = 2
    # Keep doubling until the bucket covers n or the cap is reached.
    while bucket < ceiling and n > bucket:
        bucket *= 2
    return bucket


def extract_reduce(groups):
    """Reducer: collapse each group of play records into one summary record.

    For every ``(key, records)`` pair it yields a ``Record`` holding the
    key's fields plus:

    * ``host``, ``frame_url``, ``canon_url``, ``page_url`` - taken from the
      last record of the group;
    * ``total`` - number of records in the group;
    * ``src_urls_count`` - number of distinct ``src_url`` values collected;
    * ``src_urls`` - the first five collected ``src_url`` values in sorted
      order.

    Groups that contain no records are skipped.
    """
    for key, records in groups:
        total = 0
        src_urls = set()
        last = None
        for rec in records:
            total += 1
            last = rec
            # Stop collecting once the set is large; note that the ``<=``
            # admits up to 1001 distinct URLs (kept as in the original).
            if len(src_urls) <= 1000:
                src_urls.add(rec.src_url)
        if last is None:
            # Empty group: there is no record to take the URL fields from.
            # The original would have hit an unbound (or stale) ``rec`` here.
            continue
        # Copy the key's attributes so the key object itself is not mutated.
        result = dict(vars(key))
        result['host'] = last.host
        result['frame_url'] = last.frame_url
        result['canon_url'] = last.canon_url
        result['page_url'] = last.page_url
        result['total'] = total
        result['src_urls_count'] = len(src_urls)
        result['src_urls'] = sorted(src_urls)[:5]
        yield Record(**result)


def pornhub_classifier(url):
    """Return *url* if it looks like a video page, otherwise False.

    A video page has a path starting with ``/view_video.php`` and a query
    string starting with ``viewkey``.
    """
    parts = urlparse.urlparse(url)
    is_video_page = (
        parts.path.startswith('/view_video.php') and
        parts.query.startswith('viewkey')
    )
    return url if is_video_page else False


def xv_classifier(url):
    """Return *url* if its path starts with ``/video``, otherwise False."""
    path = urlparse.urlparse(url).path
    return url if path.startswith('/video') else False


def ok_classifier(url):
    """Map an ok.ru URL to a video URL, or return False.

    * Path starts with ``/video``: *url* is returned unchanged.
    * Otherwise, if the query string has a non-empty ``st.mvId`` parameter,
      ``https://ok.ru/video/<first st.mvId value>`` is returned.
    * Anything else: False.
    """
    parts = urlparse.urlparse(url)
    if parts.path.startswith('/video'):
        return url
    movie_ids = urlparse.parse_qs(parts.query).get('st.mvId')
    if movie_ids:
        return 'https://ok.ru/video/{}'.format(movie_ids[0])
    return False


def extract_video_part(url):
    """Return the first ``video<owner>_<id>`` fragment in *url*, or None.

    The owner part may be preceded by a hyphen (e.g. ``video-123_456``).
    """
    found = re.search(r'video\-?[0-9]+_[0-9]+', url)
    if found is None:
        return
    return found.group(0)


def vk_classifier(url):
    """Map a vk.com URL to a video URL, or return False.

    Checks, in order:

    1. A ``video<owner>_<id>`` fragment anywhere in *url* yields
       ``https://vk.com/<fragment>``.
    2. For the paths ``/al_feed.php``, ``/feed`` and ``/search`` the ``z``
       query parameter is inspected: if its first value contains ``video-``
       and a video fragment can be extracted from it, the matching
       ``https://vk.com/<fragment>`` is returned.
    3. For ``/video_ext...`` paths the URL is rebuilt from the ``oid`` and
       ``id`` query parameters.
    4. Paths matching ``/video<digit>...`` return *url* unchanged.

    Everything else returns False.
    """
    direct = extract_video_part(url)
    if direct:
        return 'https://vk.com/{}'.format(direct)

    parts = urlparse.urlparse(url)
    params = urlparse.parse_qs(parts.query)
    path = parts.path

    # Feed and search pages reference the video through the "z" parameter.
    if path in {'/al_feed.php', '/feed'} or path == '/search':
        if 'z' not in params or 'video-' not in params['z'][0]:
            return False
        overlay = extract_video_part(params['z'][0])
        if not overlay:
            return False
        return 'https://vk.com/{}'.format(overlay)

    if not path.startswith('/video'):
        return False

    if path.startswith('/video_ext'):
        if params.get('oid') and params.get('id'):
            return 'https://vk.com/video{}_{}'.format(
                params['oid'][0], params['id'][0]
            )
        return False

    if re.search(r'^/video[0-9]', path):
        return url
    return False


def yt_classifier(url):
    """Return *url* if its path is exactly ``/watch``, otherwise None."""
    if urlparse.urlparse(url).path == '/watch':
        return url
    return None


# Maps a registered domain to the classifier applied to URLs on that domain.
# url_classifier() looks hosts up here; URLs whose host is not listed are
# returned by it unchanged.
host_dict = {
    'vk.com': vk_classifier,
    'youtube.com': yt_classifier,
    'xnxx.com': xv_classifier,
    'xvideos.com': xv_classifier,
    'pornhub.com': pornhub_classifier,
    'ok.ru': ok_classifier
}


def url_classifier(url, host=None):
    """Classify *url* using the per-host classifier from ``host_dict``.

    Args:
        url: URL to classify.
        host: registered domain of *url*; derived with tldextract when it is
            empty or omitted.

    Returns:
        ``None`` if *url* contains the substring ``'embed'``; *url* itself if
        the host has no entry in ``host_dict``; otherwise whatever the host's
        classifier returns (a URL, or a falsy value for a rejected URL).
    """
    # Imported inside the function rather than at module level. The original
    # probed for a misspelled name ("tldexract") inside a bare except, so it
    # always fell through to these imports anyway; import directly instead.
    import tldextract
    # idna is not used directly here; the import is kept from the original.
    # NOTE(review): presumably a fail-fast check that the package is present
    # on the worker - confirm.
    import idna  # noqa: F401

    # Cheap substring test first: embed URLs are dropped whatever the host,
    # so there is no need to extract the host for them.
    if 'embed' in url:
        return None
    if not host:
        host = tldextract.extract(url).registered_domain
    classifier = host_dict.get(host)
    if classifier is None:
        return url
    return classifier(url)


def gemini_canonize(url):
    """Canonize *url* with the local ``./geminicl`` tool, best effort.

    Runs ``./geminicl -t video --url <url>`` and parses its output as JSON.
    Returns ``Response.CanonizedUrl`` from that output when present;
    otherwise (non-zero exit status, output that is not valid JSON, or no
    canonized URL in the response) returns *url* unchanged.

    Raises ``OSError`` if the ``./geminicl`` executable cannot be started;
    that is left to propagate because it indicates a broken setup rather
    than an uncanonizable URL.
    """
    try:
        output = subprocess.check_output(
            ['./geminicl', '-t', 'video', '--url', url]
        )
    except subprocess.CalledProcessError:
        return url
    try:
        resp = json.loads(output)
    except ValueError:
        # The tool printed something that is not JSON; fall back to the input.
        return url
    # Tolerate a missing or malformed "Response" section instead of raising.
    response = resp.get("Response") if isinstance(resp, dict) else None
    if isinstance(response, dict) and "CanonizedUrl" in response:
        return response["CanonizedUrl"]
    return url


def process_ybro_plays(records):
    """Mapper: filter play records and normalize their canonical URL.

    A record is dropped when:

    * its ``page_url`` contains ``'yandex.ru/video'``;
    * its ``page_url`` does not start with ``http://`` or ``https://``;
    * no host can be extracted from its ``canon_url``;
    * ``url_classifier`` rejects its ``canon_url`` (returns a falsy value).

    Every surviving record is re-emitted as a ``Record`` with the fields
    ``autoplay`` (bool), ``canon_url`` (as returned by ``url_classifier``),
    ``page_url``, ``frame_url``, ``src_url``, ``host`` and ``frame_host``.
    """
    # The original started with a misspelled "tldexract" probe that imported
    # tldextract/idna here without using them; the helpers called below do
    # their own imports, so it was removed.
    for rec in records:
        page_url = rec.page_url
        # Cheap string filters first, before any host extraction.
        if 'yandex.ru/video' in page_url:
            continue
        if not page_url.startswith(('http://', 'https://')):
            continue

        host = extract_host(rec.canon_url)
        if not host:
            continue
        canon_url = url_classifier(rec.canon_url, host=host)
        if not canon_url:
            continue

        yield Record(
            autoplay=extract_autoplay(rec.raw_event_value),
            canon_url=canon_url,
            page_url=page_url,
            frame_url=rec.frame_url,
            src_url=rec.src_url,
            host=host,
            frame_host=extract_host(rec.frame_url)
        )


def main():
    """Build the aggregated URL tables and a sampled pool of URLs.

    Steps:

    1. Parse the command line and create the cluster (the token is read from
       the ``YT_TOKEN`` environment variable).
    2. Resolve the input table: with ``--local`` the ``--in`` value is the
       table path itself; without it ``--in`` names a JSON file whose
       ``table`` key holds the path. Exit with status 1 if the table does
       not exist.
    3. First job: map records through ``process_ybro_plays``, group by
       ``autoplay`` and ``canon_url``, reduce with ``extract_reduce``, add
       the ``cat2n`` bucket of ``total``, join against the blacklist table
       (``type="left_only"``), and write ``$job_root/urls_aggregated``; then
       count rows per ``cat2n`` into ``$job_root/cat2n_stats``.
    4. Split ``--lines`` into a per-category quota, serving the smallest
       categories first so that what they cannot fill is passed on to the
       larger ones.
    5. Second job: take a random sample of the quota size from every
       category and concatenate the samples into ``$job_root/pool``.
    6. Write a JSON object with the cluster name and the pool table path to
       ``--outfile``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cluster', default='Hahn')
    parser.add_argument('--job_root', default='home/videolog/sbr')
    parser.add_argument(
        '--blacklist', default='home/videolog/sbr/urls_blacklist'
    )
    parser.add_argument('--in', default='home/videolog/sbr/ybro_video_plays')
    parser.add_argument('--local', action='store_true')
    parser.add_argument('--lines', type=int, default=80000)
    parser.add_argument('--outfile', default='output.txt')
    parser.add_argument('--gemini', default='home/videolog/sbr/geminicl')
    parser.add_argument(
        '--pool', default=username
    )
    args = parser.parse_args()

    cluster = getattr(clusters, args.cluster)(
        pool=args.pool, token=os.environ['YT_TOKEN']
    ).env(
        templates=dict(job_root=args.job_root),
        parallel_operations_limit=10
    )
    # Local helper; this does not rebind the module-level get_hr_path.
    get_hr_path = GetHRPath(cluster)

    # "in" is a Python keyword, so the argument must be read with getattr.
    in_arg = getattr(args, 'in')
    if not args.local:
        # Close the descriptor file deterministically instead of leaking it.
        with open(in_arg) as in_file:
            in_ = json.load(in_file)['table']
    else:
        in_ = in_arg

    if not cluster.driver.exists(get_hr_path(in_)):
        sys.stderr.write('Input table {} does not exist.\n'.format(
            in_
        ))
        sys.exit(1)

    cwd = os.path.abspath(os.getcwd())
    job = cluster.job().env(
        packages=['tldextract', 'idna', 'requests_file'], package_paths=[cwd]
    )

    blacklist = job.table(
        args.blacklist
    )

    job.table(
        in_
    ).map(
        process_ybro_plays,
        # files=[
        #     nile.files.RemoteFile(
        #         args.gemini,
        #         filename='geminicl',
        #         executable=True
        #     )
        # ]
    ).groupby(
        # 'autoplay', 'frame_url', 'page_url', 'host', 'frame_host'
        'autoplay', 'canon_url'
    ).reduce(
        extract_reduce
    ).project(
        ne.all(), cat2n=ne.custom(get_2n_category, 'total')
    ).join(
        blacklist,
        by_left="canon_url", by_right="page_url",
        type="left_only"
    ).sort('total').put(
        '$job_root/urls_aggregated'
    ).groupby(
        'cat2n'
    ).aggregate(
        total=na.count()
    ).sort('total').put(
        '$job_root/cat2n_stats'
    )

    job.run()

    categories = cluster.read('$job_root/cat2n_stats')

    # cat2n bucket -> number of aggregated rows in that bucket.
    cat_dict = {}
    for rec in categories:
        cat_dict[rec.cat2n] = rec.total

    target_number = args.lines

    records_by_cat = {}

    # Counted from the dict that was just built rather than from the read
    # result, which has already been iterated; the stats table has one row
    # per cat2n, so the two counts are the same.
    cat_left = len(cat_dict)
    # Smallest categories first: whatever they cannot supply stays in
    # target_number and is shared among the remaining, larger categories.
    for cat in sorted(cat_dict, key=lambda x: cat_dict[x]):
        ask = target_number // cat_left + 1
        if cat_dict[cat] < ask:
            ask = cat_dict[cat]
        records_by_cat[cat] = ask
        target_number -= ask
        cat_left -= 1

    job = cluster.job()

    to_concat = []
    for cat in records_by_cat:
        to_concat.append(job.table(
            '$job_root/urls_aggregated'
        ).filter(
            nf.equals('cat2n', cat), memory_limit=16384
        ).random(
            count=records_by_cat[cat], memory_limit=16384
        ))

    job.concat(*to_concat).put(
        '$job_root/pool'
    )

    job.run()

    with codecs.open(
        args.outfile, 'w', 'utf8'
    ) as f:
        f.write(
            json.dumps({
                "cluster": args.cluster.lower(),
                "table": get_hr_path('$job_root/pool')
            })
        )


# Run the pipeline only when the file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
