#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    with_hints,
    Record
)
from qb2.api.v1 import resources as qr, typing as qt
import getpass
import datetime
import hashlib
import json
import re
import urlparse
from pytils import date_range

# Table read in process_date() to build the ASN -> ISP name mapping.
ASNAME_TABLE = (
    '//home/search-research/ensuetina/AS_MAP/proper_AS_names_corrected'
)
# Fixed date; its formatted form is passed as the ``date`` template value
# when the cluster object is created in main().
DATE = datetime.date(2018, 5, 23)
DATE_F = DATE.strftime('%Y-%m-%d')
# Patterns applied to the init tracking event in
# get_info_from_source_config(); each captures the value of one
# ``/key=value/`` path segment.
# NOTE: the ``ur''`` prefix is Python 2 only syntax.
re_yu = re.compile(ur'/yandexuid=([0-9]+?)/')
re_chid = re.compile(ur'/channel_id=([0-9a-z]+?)/')
re_source = re.compile(ur'/source=([0-9a-z]+?)/')
# Root path under which process_date() writes its per-date output tables.
job_root='//home/videolog/mma_1861/jst_parsed'


def parse_location(location):
    """Extract identifiers from the query string of a location URL.

    Reads ``video_content_id`` from the query string of *location* and, when
    an ``mq_url`` parameter is present, reads ``hash`` and ``yandexuid`` from
    the query string of that nested URL.

    :param location: URL string to parse.
    :return: dict holding any of the keys ``vcid``, ``hash`` and
        ``yandexuid``; a key is omitted when its parameter is missing.
    """
    result = {}
    qs = urlparse.parse_qs(urlparse.urlparse(location).query)
    try:
        result['vcid'] = qs['video_content_id'][0]
    except (KeyError, IndexError):
        pass
    try:
        # urlparse raises ValueError on a malformed nested URL.
        mq_qs = urlparse.parse_qs(urlparse.urlparse(qs['mq_url'][0]).query)
        # Assigned one at a time on purpose: a hash is kept even when the
        # nested URL carries no yandexuid.
        result['hash'] = mq_qs['hash'][0]
        result['yandexuid'] = mq_qs['yandexuid'][0]
    except (KeyError, IndexError, ValueError):
        pass
    return result


def get_info_from_source_config(source_config):
    """Pull identifiers out of a player source config.

    The first ``init`` entry of the first tracking event is searched for
    ``/yandexuid=.../``, ``/source=.../`` and ``/channel_id=.../`` segments;
    the video content id is taken from ``adConfig.videoContentId``.

    :param source_config: mapping with ``trackings`` and, optionally,
        ``adConfig`` entries.
    :return: dict. Empty when the init event cannot be located. Otherwise it
        holds ``yandexuid``, ``source`` and ``channel_id`` (``None`` when the
        segment is absent) and ``vcid`` when the config provides one.
    """
    try:
        init_event = source_config['trackings']['trackingEvents'][0]['init'][0]
    except (KeyError, ValueError, IndexError, AttributeError, TypeError):
        # Always hand back a dict so callers can pass the result straight to
        # dict.update() without special-casing a missing init event.
        return {}

    result = {}
    for key, pattern in (
        ('yandexuid', re_yu),
        ('source', re_source),
        ('channel_id', re_chid),
    ):
        try:
            match = pattern.search(init_event)
        except TypeError:
            # The init entry is not a string; treat it as "no match".
            match = None
        result[key] = match.group(1) if match else None

    try:
        result['vcid'] = source_config['adConfig']['videoContentId']
    except (KeyError, ValueError, IndexError, AttributeError, TypeError):
        pass
    return result


def get_hash(yandexuid, salt='e0440ebc0786e3d2cff6ef51319bc226'):
    """Return the hex MD5 digest of *yandexuid* concatenated with *salt*.

    Text arguments are encoded as UTF-8 before hashing, so the result does
    not depend on implicit ASCII coercion; byte strings are hashed as-is.

    :param yandexuid: identifier to hash (text or bytes).
    :param salt: value appended to the identifier before hashing.
    :return: 32-character hexadecimal digest string.
    """
    def to_bytes(value):
        # hashlib only accepts bytes; leave byte strings untouched.
        return value if isinstance(value, bytes) else value.encode('utf-8')

    return hashlib.md5(to_bytes(yandexuid) + to_bytes(salt)).hexdigest()


def optionalize_schema(schema):
    """Return a new dict mapping each field of *schema* to ``qt.Optional`` of its type."""
    optional = {}
    for field, field_type in schema.items():
        optional[field] = qt.Optional[field_type]
    return optional


@with_hints(
    # NOTE(review): the yielded records also carry ``event`` and ``vsid``,
    # which are not declared here -- confirm whether the schema should list
    # them.
    output_schema=optionalize_schema(dict(
        reason=qt.String,
        timestamp=qt.Integer,
        ip=qt.String,
        hash=qt.String,
        vcid=qt.String,
        yandexuid=qt.String,
        channel_id=qt.String,
        source=qt.String,
    ))
)
def extract_jstracer(records, reason_filter=True):
    """Turn raw js_tracer log records into flat per-event records.

    Each input record's ``Data`` field is parsed as JSON. Only the events
    ``30SecHeartbeat``, ``Stalled``, ``ConnectionSpeedChange`` and
    ``LowConnectionSpeedAlertClick`` are kept. Identifiers are collected
    from the event's source config (heartbeat and stalled events only) and
    then from the ``location`` URL, whose values take precedence. Records
    with neither a yandexuid nor a hash are dropped; a missing hash is
    derived from the yandexuid.

    :param records: iterable of records exposing a ``Data`` JSON string.
    :param reason_filter: when true, ``Stalled`` events carry the value of
        ``data.reason``; otherwise ``reason`` is always ``''``.
    :return: generator of ``Record`` objects.
    """
    tracked_events = {
        '30SecHeartbeat', 'Stalled',
        'ConnectionSpeedChange', 'LowConnectionSpeedAlertClick'
    }
    # Everything a malformed payload can raise while being picked apart.
    bad_input = (KeyError, ValueError, IndexError, AttributeError, TypeError)

    for rec in records:
        try:
            data = json.loads(rec['Data'])
        except bad_input:
            continue
        if not isinstance(data, dict):
            # Valid JSON, but not an object: nothing to extract.
            continue

        event = data.get('eventName', '')
        if event not in tracked_events:
            continue

        reason = ''
        if reason_filter and event == 'Stalled':
            try:
                reason = data['data']['reason']
            except bad_input:
                pass

        try:
            ip = data['jstracer_info']['client_ip']
        except bad_input:
            ip = None

        try:
            # Timestamp is divided by 1000 and rounded; null counts as 0.
            ts = int(round((data['timestamp'] or 0.0) / 1000.0))
        except bad_input + (OverflowError,):
            continue

        info = {}
        # Reset for every record so that events without a source config
        # never pick up the config of a previously processed record.
        source_config = None
        try:
            if event == '30SecHeartbeat':
                source_config = data['data']['source']
            elif event == 'Stalled':
                source_config = data['data']['details']['sourceConfig']
            if source_config is not None:
                # ``or {}`` tolerates a helper that returns None on failure.
                info.update(get_info_from_source_config(source_config) or {})
        except bad_input:
            pass

        try:
            info.update(parse_location(data['location']))
        except bad_input:
            pass

        info['reason'] = reason
        info['timestamp'] = ts
        info['event'] = event
        info['ip'] = ip
        info['vsid'] = data.get('sid')

        if info.get('yandexuid') and not info.get('hash'):
            info['hash'] = get_hash(info['yandexuid'])
        if not info.get('yandexuid') and not info.get('hash'):
            continue
        yield Record(**info)


def get_as(ip, ip_origins):
    """Look up the autonomous system for *ip*.

    :param ip: IP address string; may be empty or ``None``.
    :param ip_origins: object exposing ``region_by_ip(ip)``; the first
        element of its result is returned.
    :return: the looked-up value, or ``'-'`` when *ip* is empty or the
        lookup fails.
    """
    if not ip:
        return '-'
    try:
        return ip_origins.region_by_ip(ip)[0]
    except Exception:
        # Best effort: any lookup failure maps to the "unknown" marker, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return '-'


def process_date(date, hahn):
    """Build and run the js_tracer parsing job for a single day.

    Reads ``home/js_tracer/day_by_day/<dd-mm-YYYY>``, maps it through
    ``extract_jstracer``, adds ``AS`` and ``provider`` columns, and writes
    two tables under ``job_root/<YYYY-mm-dd>``: ``preaggregated`` (sorted by
    hash and timestamp) and ``events`` (record counts per event/reason).

    :param date: object with ``strftime`` (date or datetime) to process.
    :param hahn: cluster object used to create the job and to read
        ``ASNAME_TABLE``.
    """
    # Format explicitly instead of relying on str(date), so a datetime
    # argument does not leak a time component into the output path.
    date_s = date.strftime('%Y-%m-%d')
    date_reversed = date.strftime('%d-%m-%Y')

    job = hahn.job().env(parallel_operations_limit=10)

    # ASN -> ISP name mapping, loaded once and captured by the lambda below.
    asname_dict = {rec.ASN: rec.ISP for rec in hahn.read(
        ASNAME_TABLE
    )}

    job.table(
        'home/js_tracer/day_by_day/{}'.format(date_reversed)
    ).map(
        extract_jstracer
    ).project(
        ne.all(),
        AS=ne.custom(
            lambda x, y: get_as(x, y), 'ip', qr.resource('IpOrigins')
        ),
        provider=ne.custom(
            lambda x: asname_dict.get(x, 'other'), 'AS'
        )
    ).sort(
        'hash', 'timestamp'
    ).put(
        '{}/{}/preaggregated'.format(job_root, date_s)
    ).groupby(
        'event', 'reason'
    ).aggregate(
        count=na.count()
    ).put(
        '{}/{}/events'.format(job_root, date_s)
    )

    job.run()


def main():
    """Parse ``--from``/``--to`` and run process_date() for each date in range."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--from')
    parser.add_argument('--to')
    args = parser.parse_args()

    # ``from`` is a keyword, so the attribute has to be fetched by name.
    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')

    # NOTE(review): the token path is tied to one user's home directory while
    # the pool name follows the current user -- confirm this is intended.
    with open('/home/pecheny/.yt/token') as token_file:
        token = token_file.read().strip()

    hahn = clusters.yt.Hahn(
        pool='search-research_{}'.format(getpass.getuser()),
        token=token,
    ).env(
        templates=dict(
            date=DATE_F
        )
    )

    for date in date_range(from_, to_):
        process_date(date, hahn)


# Run the job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
