#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import json

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf

import nile
from nile.utils.misc import coerce_path
# import libra
import datetime
import uatraits
import urllib
import re
import random
import urlparse
import getpass

# Default value for the --pool argument; main() also interpolates it into
# the default --job_root path.
username = "videolog"

# strptime/strftime pattern for ISO dates (YYYY-MM-DD).
# NOTE(review): nothing in the visible part of this file reads this
# constant; date_range() spells the same pattern out as a literal.
date_format = '%Y-%m-%d'


class GetProperty(object):
    """Callable that extracts one property from a JSON-object string.

    Instances are passed to ``se.custom`` in ``main()`` and applied to the
    ``raw_event_value`` field.

    :param prop: key to look up in the decoded JSON object.
    """

    def __init__(self, prop):
        self.prop = prop

    def __call__(self, val):
        """Return the property as UTF-8 bytes, cut to 500 characters.

        :param val: JSON text expected to decode to an object (dict).
        :returns: ``'-'`` when ``val`` cannot be decoded or does not decode
            to a dict; otherwise the text form of the property value
            (``'None'`` when the key is absent or its value is JSON null),
            truncated to 500 characters and encoded as UTF-8.
        """
        try:
            obj = json.loads(val)
        except Exception:
            # Best-effort on arbitrary log payloads: any decoding failure
            # maps to the '-' placeholder.  Unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            return '-'

        # Explicit check instead of ``assert``: assertions are stripped
        # under ``python -O``, which would let non-dict values through.
        if not isinstance(obj, dict):
            return '-'

        # unicode() always yields a text string, so truncation and encoding
        # apply unconditionally.
        result = unicode(obj.get(self.prop, 'None'))
        return result[:500].encode('utf-8')


def process_netloc(netloc):
    """Drop a leading ``www.`` and then a leading ``m.`` from a host name.

    The prefixes are tried once each, in that order, so ``www.m.host``
    loses both while ``m.www.host`` only loses ``m.``.

    :param netloc: host part of a URL.
    :returns: the host with the matching prefixes removed.
    """
    host = netloc
    for head in ('www.', 'm.'):
        if not host.startswith(head):
            continue
        host = host[len(head):]
    return host


def is_yandex(netloc):
    """Tell whether ``netloc`` is exactly ``yandex.<zone>`` for a known zone.

    The text before the first dot must be ``yandex`` and everything after it
    must be one of ``ru``, ``ua``, ``com.tr``, ``by`` or ``kz``.  Sub-domains
    are not stripped here, so ``www.yandex.ru`` does not match.

    :param netloc: host name to test.
    :returns: ``True`` or ``False``.
    """
    label, dot, zone = netloc.partition('.')
    if not dot:
        # No dot at all: there is no zone to compare.
        return False
    return label == 'yandex' and zone in {'ru', 'ua', 'com.tr', 'by', 'kz'}


def make_good_referer(url):
    """Reduce a URL to its normalised host plus the first path segment.

    The host goes through ``process_netloc``; the path is cut after its
    first segment, e.g. ``/a/b/c`` becomes ``/a``.  Scheme, query and
    fragment are dropped.

    :param url: URL string to reduce.
    :returns: ``<host><first path segment>`` as a single string.
    """
    parsed = urlparse.urlparse(url)
    host = process_netloc(parsed.netloc)
    # Splitting '/a/b' gives ['', 'a', 'b']; the first two pieces rejoin
    # to '/a'.
    leading_pieces = parsed.path.split('/')[:2]
    return host + '/'.join(leading_pieces)


def parse_referer(url):
    """Reduce a referer URL, looking through ``yandex.ru/clck`` redirects.

    :param url: referer URL; may be empty or ``None``.
    :returns: ``None`` for an empty ``url``; otherwise the value of
        ``make_good_referer(url)``.  When that value is ``yandex.ru/clck``
        the ``from`` query parameter is unquoted, its ``;`` characters are
        replaced with ``/`` and the outcome is reduced instead; ``None`` is
        returned if the ``from`` parameter is missing.
    """
    if not url:
        return None

    reduced = make_good_referer(url)
    if reduced != 'yandex.ru/clck':
        return reduced

    query = urlparse.parse_qs(urlparse.urlparse(url).query)
    origins = query.get('from')
    if origins is None:
        return None
    target = urlparse.unquote(origins[0]).replace(';', '/')
    return make_good_referer(target)


def referer_rewrite(records):
    """Mapper: set ``referer_parsed`` from ``page_url`` for Yandex pages.

    Records for which parsing ``page_url`` raises ``AttributeError`` are
    dropped.  Records whose normalised host is not a Yandex host are passed
    through untouched.  For the rest a new ``Record`` is emitted with
    ``referer_parsed`` replaced by the reduced ``page_url``.

    :param records: iterable of records exposing ``page_url``.
    :returns: generator of records.
    """
    for record in records:
        try:
            parsed = urlparse.urlparse(record.page_url)
        except AttributeError:
            continue

        if is_yandex(process_netloc(parsed.netloc)):
            reduced = make_good_referer(record.page_url)
            fields = vars(record)
            fields['referer_parsed'] = reduced
            yield Record(**fields)
        else:
            yield record


# Column descriptions that main() stores in the output table's
# `_read_schema` attribute: one {"type": ..., "name": ...} dict per column,
# listed below as (name, type) pairs.
# A generator expression (rather than a list comprehension) keeps the loop
# variables out of module scope on Python 2.
schema = list(
    {"type": column_type, "name": column_name}
    for column_name, column_type in (
        ("AppID", "string"),
        ("app_platform", "string"),
        ("autoplay", "string"),
        ("device_id", "string"),
        ("event_datetime", "string"),
        ("event_timestamp", "int64"),
        ("event_name", "string"),
        ("frame_url", "string"),
        ("page_url", "string"),
        ("raw_event_value", "string"),
        ("session_type", "string"),
        ("src_url", "string"),
        ("referer", "string"),
        ("referer_parsed", "string"),
    )
)


class GetHRPath(object):
    """Callable that turns a templated path into a ``//``-prefixed string.

    :param cluster: cluster object whose ``environment.templates`` mapping
        supplies the template values.
    """

    def __init__(self, cluster):
        self.cluster = cluster

    def __call__(self, path):
        """Evaluate ``path`` against the cluster templates.

        :param path: path accepted by ``coerce_path``, e.g. one containing
            ``$job_root``.
        :returns: the evaluated path as ``str``, with ``//`` prepended when
            it does not already start with it.
        """
        template = coerce_path(path)
        substitutions = self.cluster.environment.templates
        evaluated = str(template.eval(**substitutions))
        if evaluated.startswith('//'):
            return evaluated
        return '//' + evaluated


def date_range(from_, to_):
    """List every date between two endpoints, both included.

    :param from_: first endpoint, a date or a ``YYYY-MM-DD`` string.
    :param to_: second endpoint, a date or a ``YYYY-MM-DD`` string.
    :returns: list of dates in one-day steps, ascending when
        ``from_ <= to_`` and descending otherwise.
    """
    endpoints = []
    for bound in (from_, to_):
        if isinstance(bound, basestring):
            bound = datetime.datetime.strptime(bound, '%Y-%m-%d').date()
        endpoints.append(bound)
    start, stop = endpoints

    earliest = min(endpoints)
    latest = max(endpoints)
    one_day = datetime.timedelta(days=1)

    days = []
    cursor = earliest
    while cursor <= latest:
        days.append(cursor)
        cursor += one_day

    # The list was built ascending; flip it when the caller asked for a
    # backwards range.
    if stop < start:
        days = days[::-1]
    return days


# Module-level placeholder.  main() assigns a local variable with the same
# name (it has no `global` statement), so this binding stays None.
get_hr_path = None


def det_dt(edt):
    """Build a ``<date>_<hour>`` key from a ``<date> <time>`` string.

    The date is the text before the first space; the hour is the text
    between that space and the next colon.  The previous implementation
    called ``.split(':')`` on the list returned by ``edt.split(' ')`` and
    therefore raised ``AttributeError`` on every call.

    :param edt: date-time text such as ``'2017-04-01 13:45:10'``.
    :returns: e.g. ``'2017-04-01_13'``; when ``edt`` has no time part the
        hour is empty and the result ends with ``_``.
    """
    day, _, clock = edt.partition(' ')
    hour = clock.split(':')[0]
    return day + '_' + hour


def main():
    """Build the video-plays table for a date range and report its location.

    Steps, in order:

    1. Parse the command line (cluster, output table/file, date range,
       pool, job root).
    2. For each day in the range, read that day's ``metrika-mobile-log``
       table twice: once for ``url opened`` events (made unique by
       ``device_id`` and ``page_url``) and once for ``video play`` events.
       The plays are left-joined with the openings on ``device_id`` and
       ``page_url``, ``referer_parsed`` is computed with ``parse_referer``,
       and the stream is passed through ``referer_rewrite``.
    3. Concatenate the daily streams into ``$job_root/<out>`` and run the
       job.
    4. Set the ``_read_schema`` attribute on the output table and write a
       JSON file naming the cluster and the table.

    Reads the ``YT_TOKEN`` environment variable; a missing variable raises
    ``KeyError``.
    """
    parser = argparse.ArgumentParser()
    option_defaults = (
        ('--cluster', 'Hahn'),
        ('--out', 'ybro_video_plays'),
        ('--outfile', 'out_table.txt'),
        ('--from', '2017-04-01'),
        ('--to', '2017-05-04'),
        ('--pool', username),
        (
            '--job_root',
            'home/videolog/{}/SBR/FRESH/AUTOPLAYS'.format(username),
        ),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, default=default)
    args = parser.parse_args()

    cluster_factory = getattr(clusters, args.cluster)
    cluster = cluster_factory(
        pool=args.pool, token=os.environ['YT_TOKEN']
    ).env(
        templates={'job_root': args.job_root},
        parallel_operations_limit=10
    )
    get_hr_path = GetHRPath(cluster)

    job = cluster.job()
    daily_streams = []
    # 'from' is a Python keyword, so that option is only reachable through
    # getattr().
    for day in date_range(getattr(args, 'from'), args.to):
        log = job.table('logs/metrika-mobile-log/1d/{}'.format(day))

        # Page openings: give each (device, page) pair its referer.
        opened_fields = [
            'device_id', 'event_name', 'raw_event_value',
            se.custom('referer', GetProperty('referer'), 'raw_event_value'),
            se.custom('page_url', GetProperty('url'), 'raw_event_value'),
        ]
        opened = log.qb2(
            log='metrika-mobile-log',
            fields=opened_fields,
            filters=[sf.equals('event_name', 'url opened')]
        ).unique('device_id', 'page_url')

        # Video plays: plain columns first, then values pulled out of the
        # JSON in raw_event_value.
        play_fields = [
            'app_platform', 'device_id', 'event_name', 'raw_event_value',
            'session_type', 'event_timestamp', 'event_datetime',
            se.dictitem('AppID', from_='parsed_log_line'),
        ]
        json_columns = (
            ('page_url', 'page url'),
            ('frame_url', 'frame url'),
            ('src_url', 'src url'),
            ('autoplay', 'autoplay'),
        )
        for column, json_key in json_columns:
            play_fields.append(
                se.custom(column, GetProperty(json_key), 'raw_event_value')
            )
        plays = log.qb2(
            log='metrika-mobile-log',
            fields=play_fields,
            filters=[sf.equals('event_name', 'video play')]
        )

        joined = plays.join(opened, type='left', by=['device_id', 'page_url'])
        with_referer = joined.project(
            ne.all(), referer_parsed=ne.custom(parse_referer, 'referer')
        )
        daily_streams.append(with_referer.map(referer_rewrite))

    job.concat(*daily_streams).put('$job_root/{}'.format(args.out))
    job.run()

    out_table = get_hr_path('$job_root/{}'.format(args.out))
    cluster.driver.client.set_attribute(out_table, '_read_schema', schema)

    with codecs.open(args.outfile, 'w', 'utf8') as f:
        location = {
            "cluster": args.cluster.lower(),
            "table": out_table
        }
        f.write(json.dumps(location))


# Run the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
