#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import json

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf

import nile
from nile.utils.misc import coerce_path
# import libra
import datetime
import uatraits
import urllib
import re
import math
import random
import urlparse
import getpass
import tldextract
from collections import defaultdict

# Default value of the --pool command-line option (see main()).
username = "videolog"

# strptime/strftime pattern for ISO dates (YYYY-MM-DD).
# NOTE(review): not referenced anywhere in the visible code -- the date helpers
# spell out the same pattern themselves; confirm before removing.
date_format = '%Y-%m-%d'


class GetProperty(object):
    """Callable that extracts one property from a JSON-encoded object.

    ``GetProperty(prop)(val)`` decodes ``val`` as JSON and returns the
    UTF-8 encoded value stored under ``prop``.

    Returns:
        * ``'-'`` when ``val`` is not valid JSON, is not a string at all, or
          decodes to something other than a JSON object;
        * the UTF-8 encoding of ``'None'`` when the object has no ``prop`` key;
        * the UTF-8 encoding of the stored value otherwise.

    Raises:
        AttributeError: if the stored value has no ``encode`` method
            (e.g. it is a number or a nested object).
    """

    def __init__(self, prop):
        # Name of the key to look up in every decoded object.
        self.prop = prop

    def __call__(self, val):
        try:
            obj = json.loads(val)
        except (TypeError, ValueError):
            # TypeError: val is not a string; ValueError: val is not JSON.
            return '-'
        # Explicit check instead of `assert`: assertions are stripped under
        # `python -O`, which would let lists/scalars reach `.get` below.
        if not isinstance(obj, dict):
            return '-'

        return obj.get(self.prop, 'None').encode('utf-8')


# Unanchored pattern: matches any text that contains "yandex.", then at least
# one arbitrary character, then "/video". VcParser uses it to drop events whose
# page URL or referer matches.
re_yandex = re.compile(r'yandex\.(.+)/video')


# Column list that main() stores in the `_read_schema` attribute of every
# output table (one {"type", "name"} entry per column).
# NOTE(review): VcParser can also emit a `duration` field and never emits
# `raw_event_value`; confirm whether the schema should reflect that.
schema = [
    {"type": "string", "name": "autoplay"},
    {"type": "string", "name": "yandexuid"},
    {"type": "string", "name": "event_datetime"},
    {"type": "int64", "name": "event_timestamp"},
    {"type": "string", "name": "frame_url"},
    {"type": "string", "name": "page_url"},
    {"type": "string", "name": "src_url"},
    {"type": "string", "name": "raw_event_value"},
    {"type": "string", "name": "referer"},
    {"type": "string", "name": "referer_parsed"},
]


class GetHRPath(object):
    """Callable that turns a templated path into a concrete '//'-prefixed one.

    The path is passed through ``coerce_path`` and evaluated with the template
    variables stored in ``cluster.environment.templates``; the resulting string
    gets a leading ``//`` unless it already starts with one.
    """

    def __init__(self, cluster):
        # Cluster object whose environment supplies the template values.
        self.cluster = cluster

    def __call__(self, path):
        coerced = coerce_path(path)
        resolved = str(coerced.eval(**self.cluster.environment.templates))
        if resolved.startswith('//'):
            return resolved
        return '//' + resolved


def date_range(from_, to_, fmt='%Y-%m-%d'):
    """Return every day between ``from_`` and ``to_``, both ends included.

    Args:
        from_: first day of the range; a ``datetime.date`` (or ``datetime``)
            object, or a string in ``fmt`` format.
        to_: last day of the range; same accepted types as ``from_``.
        fmt: ``strptime`` pattern used to parse string arguments. Defaults to
            ISO dates (``YYYY-MM-DD``), which was previously hard-coded.

    Returns:
        A list of consecutive days. It is ascending when ``from_ <= to_`` and
        descending otherwise, so the first element always corresponds to
        ``from_``.

    Raises:
        ValueError: if a string argument does not match ``fmt``.
        TypeError: if an argument is neither a date nor a string.
    """
    def _as_date(value):
        # datetime.datetime is a subclass of datetime.date, so both kinds of
        # objects are passed through untouched; anything else is parsed.
        if isinstance(value, datetime.date):
            return value
        return datetime.datetime.strptime(value, fmt).date()

    from_ = _as_date(from_)
    to_ = _as_date(to_)

    # Compute the bounds once instead of on every loop iteration.
    start, stop = min(from_, to_), max(from_, to_)
    one_day = datetime.timedelta(days=1)
    result = [
        start + one_day * offset
        for offset in range((stop - start).days + 1)
    ]
    if to_ < from_:
        result.reverse()
    return result


# Module-level placeholder for a GetHRPath instance.
# NOTE(review): main() assigns to a *local* variable with the same name (it has
# no `global` statement), so this global stays None -- confirm the intent.
get_hr_path = None


def det_dt(edt):
    """Reduce a 'YYYY-MM-DD HH:MM:SS' string to an hourly 'YYYY-MM-DD_HH' key.

    Args:
        edt: text with a date part and a time part separated by one space.

    Returns:
        The date part, an underscore, and everything in the time part that
        precedes the first colon.

    Raises:
        IndexError: if ``edt`` contains no space (there is no time part).
    """
    parts = edt.split(' ')
    # The time part is the second space-separated token; the previous code
    # called .split(':') on the token list itself and always failed.
    return parts[0] + '_' + parts[1].split(':')[0]


def parse_date(str_):
    """Parse a 'YYYY-MM-DD' string and return it as a ``datetime.date``.

    Raises ``ValueError`` when the text does not match that pattern.
    """
    parsed = datetime.datetime.strptime(str_, '%Y-%m-%d')
    return parsed.date()


def extract_host(url):
    """Return ``tldextract``'s ``registered_domain`` for ``url``.

    An empty string is returned when the extraction raises ``TypeError``.
    """
    try:
        extracted = tldextract.extract(url)
        host = extracted.registered_domain
    except TypeError:
        host = ""
    return host


def percent_decode(url):
    """Undo percent-encoding (``%XX`` escapes) in ``url``.

    Values that are not ``str`` instances are first encoded to UTF-8
    (unencodable characters are replaced) and then unquoted.
    """
    if isinstance(url, str):
        raw = url
    else:
        raw = url.encode('utf8', errors='replace')
    return urlparse.unquote(raw)


# (str, unicode) on Python 2 and (str, str) on Python 3.
_VC_STRING_TYPES = (str, type(u''))


def _vc_event_datetime(timestamp):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS', or return None.

    The conversion uses ``datetime.fromtimestamp`` without a timezone, i.e.
    the local time of the machine running the code. ``None`` is returned for
    missing, non-numeric or out-of-range timestamps.
    """
    try:
        return datetime.datetime.fromtimestamp(
            int(timestamp)
        ).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError):
        return None


def parse_vc(recs):
    """Turn raw records with a JSON ``dec`` payload into video-event Records.

    Each input record is expected to provide ``dec`` (JSON text),
    ``timestamp`` and ``yandexuid``. The payload must be a list of objects;
    the first key of every object is taken as the page URL and the value
    under it must hold a list ``p`` of positional event lists. Events of
    length 10 carry the source URL at index 8, events of length 11 or 13 at
    index 9; every other length is ignored.

    Anything malformed (invalid JSON, non-list payload, empty or non-object
    entries, non-list events, non-string source URLs) is skipped instead of
    aborting the whole stream.

    Yields:
        ``Record`` objects with ``page_url``, ``frame_url`` (both set to the
        page URL), ``src_url`` (``blob:`` prefix removed, percent-decoded),
        ``raw_event_value`` (always empty), ``yandexuid``,
        ``event_timestamp``, ``event_datetime`` and ``autoplay='unknown'``.
    """
    for rec in recs:
        try:
            dec = json.loads(rec.dec)
        except (AttributeError, TypeError, ValueError):
            # No payload, payload is not text, or payload is not JSON.
            continue
        if not isinstance(dec, list):
            continue

        for d in dec:
            # An empty object has no "first key"; a non-object has no keys.
            if not isinstance(d, dict) or not d:
                continue
            url = next(iter(d))
            try:
                p = d[url]['p']
            except (KeyError, TypeError):
                continue
            if not isinstance(p, list):
                continue
            for el in p:
                if not isinstance(el, list):
                    continue
                if len(el) == 10:
                    src_url = (el[8] or '')
                elif len(el) in (11, 13):
                    src_url = (el[9] or '')
                else:
                    continue
                if not isinstance(src_url, _VC_STRING_TYPES):
                    # e.g. a number where the URL should be: malformed event.
                    continue
                if src_url.startswith('blob:'):
                    src_url = src_url[len('blob:'):]
                src_url = percent_decode(src_url)
                result = dict(
                    page_url=url, frame_url=url, src_url=src_url,
                    raw_event_value="", yandexuid=rec.yandexuid,
                    event_timestamp=rec.timestamp,
                    event_datetime=_vc_event_datetime(rec.timestamp),
                    autoplay='unknown',
                )
                yield Record(**result)


class VcParser(object):
    """Mapper that converts raw ``dec`` JSON payloads into video-play Records.

    Instances are callables taking an iterable of input records and yielding
    output ``Record`` objects. Each input record must provide ``dec`` (JSON
    text), ``timestamp`` and ``yandexuid``; ``referer`` is optional.

    Two payload layouts are supported:

    * the "old" layout -- a list of ``{page_url: {"p": [event, ...]}}``
      objects where every event is a positional list of 10, 11 or 13 items;
    * the "new" layout (chosen when the raw text contains
      ``"rebuffering_times"``) -- a list of
      ``{"url": ..., "data": [{"uid": ..., "frame_url": ..., ...}, ...]}``.

    Malformed payloads, entries and events are skipped rather than allowed
    to abort the whole stream.
    """

    # (str, unicode) on Python 2 and (str, str) on Python 3.
    _STRING_TYPES = (str, type(u''))

    # Positional layouts of an old-style event, keyed by the event length:
    # (play length index, source URL index, duration index, frame URL index).
    # None means "this layout does not carry that field".
    _OLD_LAYOUTS = {
        10: (4, 8, None, None),
        11: (5, 9, None, None),
        13: (5, 9, 4, 11),
    }

    @staticmethod
    def _to_float(value, default=0):
        """Return ``float(value)``, or ``default`` if it cannot be converted."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _event_datetime(timestamp):
        """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS', or return None.

        ``datetime.fromtimestamp`` is called without a timezone, so the result
        is in the local time of the machine running the mapper.
        """
        try:
            return datetime.datetime.fromtimestamp(
                int(timestamp)
            ).strftime('%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError, OverflowError, OSError):
            return None

    @staticmethod
    def _is_yandex_video(value):
        """Tell whether ``value`` is text matching ``re_yandex``.

        The value is searched as-is (no ``str()`` conversion, which fails on
        non-ASCII unicode under Python 2); empty and non-text values never
        match.
        """
        if not value:
            return False
        try:
            return re_yandex.search(value) is not None
        except TypeError:
            return False

    def _old_parse(self, dec, rec):
        """Yield Records for an old-layout payload (positional event lists).

        An event is dropped when its play length exceeds 86400 (one day's
        worth of seconds, assuming the value is in seconds), when the page URL
        or the referer matches ``re_yandex``, or when the page/frame URL is
        empty, does not start with 'http' or is longer than 300 characters.
        """
        # NOTE(review): main() does not request a `referer` field for this
        # mapper's input -- confirm it is ever present on these records.
        referer = getattr(rec, 'referer', None)
        for d in dec:
            # An empty object has no "first key"; a non-object has no keys.
            if not isinstance(d, dict) or not d:
                continue
            url = next(iter(d)) or ''
            try:
                p = d[url]['p']
            except (KeyError, TypeError):
                continue
            if not isinstance(p, list):
                continue
            for el in p:
                if not isinstance(el, list):
                    continue
                layout = self._OLD_LAYOUTS.get(len(el))
                if layout is None:
                    continue
                play_idx, src_idx, duration_idx, frame_idx = layout

                play_length = self._to_float(el[play_idx])
                if play_length > 86400:
                    continue
                duration = None if duration_idx is None else el[duration_idx]
                src_url = el[src_idx] or ''
                if frame_idx is None:
                    frame_url = url or ''
                else:
                    frame_url = el[frame_idx] or ''
                if not (
                    isinstance(src_url, self._STRING_TYPES) and
                    isinstance(frame_url, self._STRING_TYPES)
                ):
                    # e.g. a number where a URL should be: malformed event.
                    continue

                if self._is_yandex_video(url) or self._is_yandex_video(
                    referer
                ):
                    continue
                if (
                    not url or not frame_url or
                    not url.startswith('http') or
                    not frame_url.startswith('http')
                ):
                    continue
                if len(url) > 300 or len(frame_url) > 300:
                    continue

                if src_url.startswith('blob:'):
                    src_url = src_url[len('blob:'):]
                src_url = percent_decode(src_url)

                result = dict(
                    page_url=self._canonize_page_url(url),
                    frame_url=self._canonize_frame_url(frame_url),
                    src_url=src_url,
                    yandexuid=rec.yandexuid,
                    event_timestamp=rec.timestamp,
                    event_datetime=self._event_datetime(rec.timestamp),
                    autoplay="unknown"
                )
                if duration:
                    result['duration'] = duration
                yield Record(**result)

    @staticmethod
    def _canonize_page_url(url):
        """Hook for page URL normalisation; currently returns it unchanged."""
        return url

    @staticmethod
    def _canonize_frame_url(url):
        """Hook for frame URL normalisation; currently returns it unchanged."""
        return url

    def _new_parse(self, dec, rec):
        """Yield one Record per distinct ``uid`` of a new-layout payload.

        Elements sharing a ``uid`` are merged: played time and duration are
        summed (unparsable values count as 0), and the frame/source URLs of
        the last element win. Entries without ``url``/``data`` and elements
        without ``uid``/``frame_url``/``media_url`` are skipped.
        """
        for d in dec:
            try:
                page_url = d["url"]
                data = d["data"]
            except (KeyError, TypeError):
                continue
            if not isinstance(data, list):
                continue
            by_uid = defaultdict(lambda: {
                "length": 0,
                "duration": 0,
            })
            for elem in data:
                try:
                    uid = elem["uid"]
                    frame_url = elem["frame_url"]
                    src_url = elem["media_url"]
                    # uid is used as a dict key below; reject unhashable ones.
                    hash(uid)
                except (KeyError, TypeError):
                    continue
                if not isinstance(src_url, self._STRING_TYPES):
                    continue
                if src_url.startswith('blob:'):
                    src_url = src_url[len('blob:'):]
                dct = by_uid[uid]
                dct["length"] += self._to_float(elem.get("played_duration"))
                dct["duration"] += self._to_float(elem.get("duration"))
                dct["frame_url"] = frame_url
                dct["src_url"] = percent_decode(src_url)
            event_datetime = self._event_datetime(rec.timestamp)
            for uid in by_uid:
                dct = by_uid[uid]
                yield Record(
                    page_url=self._canonize_page_url(page_url),
                    frame_url=self._canonize_frame_url(dct["frame_url"]),
                    src_url=dct["src_url"],
                    yandexuid=rec.yandexuid,
                    event_timestamp=rec.timestamp,
                    event_datetime=event_datetime,
                    autoplay="unknown"
                )

    def __call__(self, recs):
        """Parse every record of ``recs`` and yield the resulting Records.

        Records whose referer matches ``re_yandex`` and records whose ``dec``
        is not a JSON list are skipped.
        """
        for rec in recs:
            if self._is_yandex_video(getattr(rec, 'referer', None)):
                continue
            try:
                dec = json.loads(rec.dec)
            except (TypeError, ValueError):
                continue
            if not isinstance(dec, list):
                continue
            # The layout is detected on the raw text, not the decoded value.
            if "rebuffering_times" in rec.dec:
                parse = self._new_parse
            else:
                parse = self._old_parse
            for rec_ in parse(dec, rec):
                yield rec_


def process_netloc(netloc):
    """Strip a leading 'www.' and then a leading 'm.' from a host name.

    The prefixes are tried once each, in that order, so 'www.m.host' becomes
    'host' while 'm.www.host' only loses the 'm.' part.
    """
    stripped = netloc
    if stripped.startswith('www.'):
        stripped = stripped[len('www.'):]
    if stripped.startswith('m.'):
        stripped = stripped[len('m.'):]
    return stripped


def is_yandex(netloc):
    """Return True when ``netloc`` is exactly 'yandex.' plus a known suffix.

    The recognised suffixes are 'ru', 'ua', 'com.tr', 'by' and 'kz'; hosts
    without a dot never match.
    """
    label, dot, suffix = netloc.partition('.')
    return (
        bool(dot) and
        label == 'yandex' and
        suffix in {'ru', 'ua', 'com.tr', 'by', 'kz'}
    )


def make_good_referer(url):
    """Reduce a URL to '<host>/<first path segment>'.

    The host has its 'www.' / 'm.' prefix removed by ``process_netloc``; the
    scheme, query, fragment and deeper path segments are discarded. A URL
    without a path yields just the host.
    """
    parsed = urlparse.urlparse(url)
    host = process_netloc(parsed.netloc)
    # For '/a/b' the split gives ['', 'a', 'b']; keeping two items and
    # re-joining produces '/a'.
    leading_segments = parsed.path.split('/')[:2]
    return host + '/'.join(leading_segments)


def parse_referer(url):
    """Reduce a referer URL to '<host>/<first path segment>'.

    Referers that reduce to 'yandex.ru/clck' are unwrapped: the URL stored in
    their ``from`` query parameter is unquoted, has every ';' replaced with
    '/', and is reduced instead.

    Returns:
        The reduced referer, or ``None`` when ``url`` is empty, cannot be
        parsed, or is a 'yandex.ru/clck' link whose ``from`` parameter is
        missing or unparsable.
    """
    if not url:
        return None
    try:
        p = urlparse.urlparse(url)
    except ValueError:
        return None
    reduced = make_good_referer(url)
    if reduced != 'yandex.ru/clck':
        return reduced

    qs = urlparse.parse_qs(p.query)
    try:
        unquoted = urlparse.unquote(qs['from'][0]).replace(';', '/')
    except KeyError:
        return None
    try:
        # The embedded URL is arbitrary external input, so it can fail to
        # parse just like the outer one.
        return make_good_referer(unquoted)
    except ValueError:
        return None


def referer_rewrite(records):
    """Replace ``referer_parsed`` for records whose page is on Yandex.

    Records whose ``page_url`` host (after ``process_netloc``) does not pass
    ``is_yandex`` are yielded untouched. For the others a new ``Record`` is
    yielded with ``referer_parsed`` set to the reduced form of the page URL
    itself. Records for which parsing ``page_url`` raises ``AttributeError``
    are dropped.
    """
    for record in records:
        try:
            parsed = urlparse.urlparse(record.page_url)
        except AttributeError:
            continue
        host = process_netloc(parsed.netloc)
        if is_yandex(host):
            reduced = make_good_referer(record.page_url)
            fields = vars(record)
            fields['referer_parsed'] = reduced
            yield Record(**fields)
        else:
            yield record


def main():
    """Build and run the job that extracts video plays for a range of days.

    For every day between ``--from`` and ``--to`` the job:

    * reads ``logs/bar-navig-log/1d/<day>``, keeps rows with ``decoded_vc``
      and ``yandexuid`` defined and maps them through ``VcParser``;
    * reads ``logs/bs-watch-log/1d/<day>`` and keeps one
      (yandexuid, page_url, referer) row per (yandexuid, page_url) pair;
    * left-joins the two on (yandexuid, page_url), adds ``referer_parsed``,
      applies ``referer_rewrite`` and stores the result in
      ``$job_root/<day>``.

    After the run every output table gets the ``_read_schema`` attribute and
    a JSON object naming the cluster and the last table is written to
    ``--outfile``.

    The YT token is read from the ``YT_TOKEN`` environment variable; a
    ``KeyError`` is raised when it is not set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cluster', default='Hahn')
    # NOTE(review): --out is parsed but never read below.
    parser.add_argument('--out', default='ybro_video_plays')
    parser.add_argument('--outfile', default='out_table.txt')
    parser.add_argument('--from', default='2017-04-01')
    parser.add_argument('--to', default='2017-05-04')
    parser.add_argument(
        '--pool', default=username
    )
    parser.add_argument(
        '--job_root',
        # The literal used to contain no placeholder, which made the
        # .format(username) call a no-op; the resulting default is unchanged.
        default='home/{}/sbr/desktop'.format(username)
    )
    args = parser.parse_args()

    cluster = getattr(clusters, args.cluster)(
        pool=args.pool, token=os.environ['YT_TOKEN']
    ).env(
        templates=dict(
            job_root=args.job_root
        ),
        parallel_operations_limit=10
    )
    # Local variable: the module-level name of the same spelling is untouched.
    get_hr_path = GetHRPath(cluster)

    job = cluster.job().env(
        packages=['tldextract', 'idna', 'requests_file'],
        package_paths=[os.getcwd()],
        parallel_operations_limit=10
    )

    tables = []
    # 'from' is a keyword, so the parsed value is only reachable via getattr.
    for d in date_range(getattr(args, 'from'), getattr(args, 'to')):
        log = job.table('logs/bar-navig-log/1d/{}'.format(d))

        vc = log.qb2(
            log='bar-navig-log',
            fields=[
                'timestamp', 'yandexuid', 'geo_id',
                'url', 'parsed_http_params',
                se.dictitem('decoded_vc', from_='parsed_http_params'),
                se.custom('dec', lambda x: x[0] if x else '-', 'decoded_vc'),
            ],
            filters=[
                sf.defined('decoded_vc', 'yandexuid'),
                # sf.region_belongs([225], field='geo_id')
            ]
        ).map(
            VcParser(), memory_limit=2000
        )

        watch_log = job.table(
            'logs/bs-watch-log/1d/{}'.format(d)
        ).qb2(
            log='bs-watch-log',
            fields=[
                'yandexuid', 'url', 'referer'
            ],
            filters=[
                sf.default_filtering('bs-watch-log')
            ]
        ).filter(
            nf.and_(
                nf.custom(bool, 'yandexuid'),
                nf.custom(bool, 'url'),
                nf.custom(bool, 'referer'),
            )
        ).project(
            'referer', yandexuid=ne.custom(lambda x: str(x), 'yandexuid'),
            page_url='url'
        ).unique(
            'yandexuid', 'page_url'
        )

        out_table = get_hr_path('$job_root/{}'.format(d))
        tables.append(out_table)

        vc.join(
            watch_log, type='left', by=['yandexuid', 'page_url']
        ).project(
            ne.all(), referer_parsed=ne.custom(parse_referer, 'referer')
        ).map(
            referer_rewrite
        ).put(
            out_table
        )

    job.run()

    for table in tables:
        cluster.driver.client.set_attribute(
            table,
            '_read_schema',
            schema
        )

    # The file used to be reopened in 'w' mode for every table, so only the
    # entry for the last table ever survived; write that entry once instead.
    # date_range() always returns at least one day, so `tables` is not empty.
    with codecs.open(args.outfile, 'w', 'utf8') as f:
        f.write(
            json.dumps({
                "cluster": args.cluster.lower(),
                "table": tables[-1]
            })
        )


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
