#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import datetime
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record
)
import re
import math
import json

from qb2.api.v1 import extractors as se, filters as sf

import nile


def date_range(from_, to_):
    """Return every calendar day between two dates, both ends included.

    Each argument is either a ``datetime.date`` or a ``'%Y-%m-%d'`` string.
    The result runs from ``from_`` to ``to_``: ascending when ``from_`` is
    the earlier date, descending when it is the later one.

    Raises:
        ValueError: if a string argument does not match ``'%Y-%m-%d'``.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # helper does not die with NameError on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(from_, string_types):
        from_ = datetime.datetime.strptime(from_, '%Y-%m-%d').date()
    if isinstance(to_, string_types):
        to_ = datetime.datetime.strptime(to_, '%Y-%m-%d').date()

    first, last = min(from_, to_), max(from_, to_)
    one_day = datetime.timedelta(days=1)
    days = []
    current = first
    while current <= last:
        days.append(current)
        current += one_day
    # Days were collected in ascending order; flip them for a reversed range.
    if to_ < from_:
        days.reverse()
    return days


class VcParser(object):
    """Mapper that flattens JSON-encoded view logs into per-view records.

    Every input record is read through three attributes: ``dec`` (a JSON
    string), ``referer`` and ``yandexuid``.  ``dec`` is decoded into a
    sequence of objects whose first key is a URL and whose value holds a
    list of views under ``'p'``.  Each view is itself a list; its size
    (10, 11 or 13 items) decides which positions hold the watched length
    and the maximum length.
    """

    def __init__(self, date):
        # Stored as given; nothing in this class reads it back.
        self.date = date

    def __call__(self, recs):
        """Yield one ``Record`` per accepted view found in ``recs``.

        Each emitted record has ``frame_url``, ``yandexuid``, ``length`` and
        ``lvt`` (0 below 30, otherwise ``log(length - 25)``).

        Skipped silently: records whose ``dec`` is not valid JSON, entries
        with no key, entries without a list under ``'p'``, views of any
        other size, views longer than their positive maximum, views whose
        URL or referer matches ``yandex.<...>/video``, and views whose URL
        or frame URL does not start with ``http``.

        Raises:
            Exception: carrying the URL and referer when the Yandex check
                itself fails.
        """
        re_yandex = re.compile(
            r'yandex\.(.+)/video'
        )
        for rec in recs:
            try:
                dec = json.loads(rec.dec)
            except (TypeError, ValueError):
                continue
            for d in dec:
                # ``d.keys()[0]`` raised IndexError on an empty object and is
                # not subscriptable at all on Python 3, so materialise the
                # keys first and skip entries that have none.
                keys = list(d.keys())
                if not keys:
                    continue
                url = keys[0] or ''
                try:
                    p = d[url]['p']
                except KeyError:
                    continue
                if not isinstance(p, list):
                    continue
                for el in p:
                    size = len(el)
                    if size == 10:
                        length = float(el[4])
                        max_length = float(el[3])
                    elif size == 11:
                        length = float(el[5])
                        max_length = float(el[4])
                    elif size == 13:
                        length = float(el[5])
                        # 'live' in the maximum-length slot counts as 0,
                        # which switches off the length cap below.
                        if el[4] == 'live':
                            max_length = 0.0
                        else:
                            max_length = float(el[4])
                    else:
                        continue
                    if max_length > 0 and length > max_length:
                        continue
                    # Only 13-item views carry their own frame URL.
                    if size == 13:
                        frame_url = el[11] or ''
                    else:
                        frame_url = url or ''
                    try:
                        if re_yandex.search(url) or re_yandex.search(
                            str(rec.referer)
                        ):
                            continue
                    except Exception:
                        raise Exception(
                            'URL: {}, REFERER: {}'.format(
                                url, rec.referer
                            )
                        )
                    if (
                        not url or not frame_url or
                        not url.startswith('http') or
                        not frame_url.startswith('http')
                    ):
                        continue
                    if length < 30:
                        lvt = 0
                    else:
                        lvt = math.log(length - 25)
                    yield Record(
                        frame_url=frame_url,
                        yandexuid=rec.yandexuid,
                        length=length,
                        lvt=lvt
                    )


class StatsParser(object):
    """Mapper that turns 'video statistics' events into per-play records.

    Each input record is read through ``device_id`` and ``raw_event_value``;
    the latter is a JSON object from which the keys ``'frame url'``,
    ``'page_url'`` and ``'play length'`` are taken.
    """

    def __init__(self, date):
        # Parsed to a ``datetime.date`` (or None); not read by ``__call__``.
        self.date = parse_date(date)

    def __call__(self, records):
        """Yield one ``Record`` per usable event in ``records``.

        Each emitted record has ``device_id``, ``frame_url``, ``page_url``,
        ``length`` (the integer play length) and ``lvt`` (natural log of the
        play length when it exceeds 30, otherwise 0).

        Skipped silently: events whose payload is not a JSON object, events
        whose play length is missing, non-integer, zero or above 86400, and
        events whose frame or page URL matches ``yandex.<...>/video``.
        """
        re_yandex = re.compile(
            r'yandex\.(.+)/video'
        )
        for rec in records:
            device_id = rec.device_id
            val = rec.raw_event_value

            # One guarded conversion is enough: a missing or malformed play
            # length skips the event, exactly as a zero fallback would a few
            # lines further down.
            try:
                obj = json.loads(val)
                # NOTE(review): 'frame url' and 'play length' use a space
                # while 'page_url' uses an underscore - confirm against the
                # event schema.
                frame_url = obj.get('frame url') or ''
                page_url = obj.get('page_url') or ''
                play_length = int(obj.get('play length'))
            except (TypeError, AttributeError, ValueError):
                continue

            if re_yandex.search(frame_url) or re_yandex.search(page_url):
                continue

            if not play_length or play_length > 86400:
                continue

            if play_length > 30:
                lvt = math.log(play_length, math.e)
            else:
                lvt = 0

            yield Record(
                device_id=device_id,
                frame_url=frame_url,
                page_url=page_url,
                length=play_length,
                lvt=lvt
            )


class DataReducer(object):
    """Reducer that merges per-key ``data`` dicts and trims old days.

    Every record of a group carries a ``data`` dict keyed by
    ``'%Y-%m-%d'`` strings.  The dicts are merged into the first record of
    the group, and only days within 180 days before the reducer's date (or
    later) are kept.
    """

    def __init__(self, date):
        # Reference day for the 180-day retention window.
        self.date = parse_date(date)

    def __call__(self, groups):
        """Yield one merged ``Record`` per group that still has data.

        The first record of a group supplies all output fields; ``data``
        from the following records is merged into it, later records winning
        on equal days.  Groups whose ``data`` ends up empty are dropped.
        """
        for key, records in groups:
            merged = {}
            for rec in records:
                if not merged:
                    merged.update(vars(rec))
                else:
                    merged['data'].update(rec.data)

            # Computed once per group rather than once per day key.
            cutoff = self.date - datetime.timedelta(days=180)
            recent = {}
            for day, stats in merged['data'].items():
                parsed = parse_date(day)
                # parse_date() returns None for a malformed key; comparing
                # None with a date raises TypeError, so such keys are
                # dropped instead of killing the whole job.
                if parsed is not None and parsed >= cutoff:
                    recent[day] = stats
            merged['data'] = recent

            if recent:
                yield Record(**merged)


def parse_date(s):
    """Convert a ``'%Y-%m-%d'`` string into a ``datetime.date``.

    Returns None when ``s`` is not a string or does not match the format.
    """
    try:
        moment = datetime.datetime.strptime(s, '%Y-%m-%d')
    except (TypeError, ValueError, AttributeError):
        return None
    return moment.date()


# Matches the first YYYY-MM-DD looking token anywhere in a string.
re_date = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')


def get_date_from_table(table):
    """Return the first ``YYYY-MM-DD`` substring of ``table``, or None.

    Only the digit layout is checked; the token is not validated as a real
    calendar date.
    """
    found = re_date.search(table)
    if found is None:
        return None
    return found.group(0)


def process_table(table, table_additive, hahn, field):
    """Fold one daily log table into the additive statistics table.

    The job reads ``table`` through qb2 with the 'metrika-mobile-log'
    schema, keeps 'video statistics' events from region 225, parses them
    with ``StatsParser`` and sums ``length`` and ``lvt`` per
    ``(device_id, field)``.  Sums outside ``0 < length <= 86400`` are
    dropped; the rest is aggregated per ``field`` into ``tvt``, ``lvt``,
    ``users``, ``shows`` and ``avg_vt`` and wrapped as ``data={date: {...}}``.
    That stream is concatenated with the current ``table_additive``, merged
    by ``DataReducer``, sorted by ``field`` and written back to
    ``table_additive``.

    Args:
        table: path of the daily log table; its date is taken from the path.
        table_additive: path of the accumulated table, read and rewritten.
        hahn: cluster object providing ``job()`` and ``driver.client``.
        field: name of the column to aggregate by.

    After a successful run the ``last_date`` attribute of ``table_additive``
    is advanced to the table's date when that date is newer and not in the
    future.
    """
    date = get_date_from_table(table)
    job = hahn.job()
    daily_stats = job.table(table, ignore_missing=True).qb2(
        log='metrika-mobile-log',
        fields=[
            'app_platform', 'device_id',
            'event_name', 'raw_event_value',
            'session_type', 'date', 'geo_id',
            'event_timestamp',
            se.dictitem('AppID', from_='parsed_log_line')
        ],
        filters=[
            sf.equals('event_name', 'video statistics'),
            sf.region_belongs([225], field='geo_id')
        ]
    ).map(
        StatsParser(date)
    ).groupby('device_id', field).aggregate(
        length=na.sum('length'),
        lvt=na.sum('lvt')
    ).filter(
        nf.custom(lambda x: 0 < x <= 86400, 'length')
    ).groupby(
        field
    ).aggregate(
        tvt=na.sum('length'),
        lvt=na.sum('lvt'),
        users=na.count_distinct_estimate('device_id'),
        shows=na.count(),
        avg_vt=na.mean('length')
    ).project(
        field,
        data=ne.custom(
            lambda t, l, u, s, a: {
                date: {
                    'tvt': t, 'lvt': l, 'users': u, 'shows': s, 'avg_vt': a
                }
            }, 'tvt', 'lvt', 'users', 'shows', 'avg_vt'
        )
    )

    job.concat(
        daily_stats,
        job.table(table_additive, ignore_missing=True)
    ).groupby(field).reduce(
        DataReducer(date)
    ).sort(field).put(
        table_additive
    )

    # The watermark is read before the run, as it was originally.
    last_date = parse_date(hahn.driver.client.get_attribute(
        table_additive, 'last_date'
    ))
    date_parsed = parse_date(date)

    # Run first, move the watermark second.  Advancing ``last_date`` before
    # the job has succeeded would mark the day as done even when the job
    # fails, and the automatic mode would never pick that day up again.
    job.run()

    # Either date may be None when it cannot be parsed; comparing None with
    # a date raises TypeError, so those cases are handled explicitly.
    is_newer = last_date is None or (
        date_parsed is not None and date_parsed > last_date
    )
    if (
        date_parsed is not None and is_newer and
        date_parsed <= datetime.date.today()
    ):
        hahn.driver.client.set_attribute(
            table_additive, 'last_date', date
        )


def main():
    """Command-line entry point: update the additive table from daily logs.

    With both ``--from`` and ``--to`` given as ``%Y-%m-%d`` dates, every
    existing daily table in that range is processed.  Otherwise every table
    under ``//logs/metrika-mobile-log/1d`` dated after the additive table's
    ``last_date`` attribute and not after today is processed, oldest first.

    A missing additive table is created empty with ``last_date`` set to 180
    days before today.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--from')
    parser.add_argument('--to')
    parser.add_argument('--token')
    parser.add_argument(
        '--additive',
        default='//home/videolog/selrank_stats/additive_mobile'
    )
    # process_table() cannot run without a grouping column and used to be
    # called without one, which raised TypeError on the first table.  There
    # is no value that can be guessed safely, so the caller must name it.
    parser.add_argument(
        '--field',
        required=True,
        help='column to aggregate by, e.g. frame_url or page_url'
    )
    args = parser.parse_args()

    kwargs = {}
    if args.token:
        kwargs['token'] = args.token
    hahn = clusters.yt.Hahn(**kwargs)

    table_additive = args.additive
    field = args.field
    if not hahn.driver.exists(table_additive):
        hahn.write(table_additive, [])
        hahn.driver.client.set_attribute(
            table_additive,
            'last_date',
            (
                datetime.date.today() - datetime.timedelta(days=180)
            ).strftime('%Y-%m-%d')
        )

    last_date = parse_date(hahn.driver.client.get_attribute(
        table_additive, 'last_date'
    ))
    current_date = datetime.date.today()

    from_ = parse_date(getattr(args, 'from'))
    to_ = parse_date(getattr(args, 'to'))

    if not from_ or not to_:
        if last_date is None:
            # Without a usable watermark the comparison below would fail
            # with an opaque TypeError on the first dated path.
            raise ValueError(
                'last_date attribute of {} is not a %Y-%m-%d date'.format(
                    table_additive
                )
            )

        def is_pending(path):
            # Parse the date once per path instead of three times.
            day = parse_date(get_date_from_table(path))
            return bool(day) and last_date < day <= current_date

        # Oldest first, so a failure never leaves an unprocessed day behind
        # a watermark that a later day has already advanced.
        tables = sorted(
            hahn.driver.client.search(
                root='//logs/metrika-mobile-log/1d',
                path_filter=is_pending
            ),
            key=get_date_from_table
        )
        print(format(tables))
        for table in tables:
            if hahn.driver.exists(table):
                process_table(table, table_additive, hahn, field)
        print('finished')
    else:
        for date in date_range(from_, to_):
            table = '//logs/metrika-mobile-log/1d/{}'.format(date)
            if hahn.driver.exists(table):
                process_table(table, table_additive, hahn, field)


if __name__ == "__main__":
    main()
