#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record,
    with_hints
)
from qb2.api.v1 import (
    filters as sf, extractors as se, typing as qt, resources as qr
)
import getpass
import json
import datetime
import gc
import os
import hashlib
import urlparse
import re
from strm_cube_2_common import StrmParser
from memory_profiler import profile

# YT table read in main() to build the ASN -> ISP name mapping.
ASNAME_TABLE = (
    '//home/search-research/ensuetina/AS_MAP/proper_AS_names_corrected'
)
# Populated by main(); call() passes it to parse_as().
asname_dict = None

# DVR chunk request: group 1 is the path between "/dvr/" and the last "/",
# group 2 is the token between the last "-" and ".ts". Example:
# /dvr/edatv/edatv0_169_576p-1482310130000.ts
request_re = re.compile(r'^\/dvr\/(.*)/.*-(.*)\.ts')
# "kal" chunk request: group 1 is the channel path, group 3 the numeric
# segment index. Example:
# /kal/ntv_cv/ntv_cv0_169_576p.json/seg-300660927-v1-a1.ts
request_kal_re = re.compile(r'^\/kal\/(.*)/(.*)/.*-([0-9]+)-.*\.ts')
# VOD chunk request: group 1 is the name between "vh-" and "-converted".
# Example:
# /vh-ntv-converted/vod-content/138931_169_576p7.ts
vod_request_re = re.compile(r'vh-(.*)-converted/vod-content/.*\.ts')
# "<name>-vod" chunk request: group 1 is the name before "-vod".
zen_request_re = re.compile(r'/(.*)-vod/vod-content/.*\.ts')
# Identifier patterns below (re_yu .. re_source) are not referenced by the
# functions visible in this file. NOTE(review): possibly used by importers
# of this module -- confirm before removing.
re_yu = re.compile(ur'/yandexuid=([0-9]+?)/')
re_yu2 = re.compile(ur'"yaUID":"([0-9]+?)"')
re_reqid = re.compile(ur'/reqid=([0-9\.\-]+?)/')
re_chid = re.compile(ur'/channel_id=([0-9a-z]+?)/')
re_source = re.compile(ur'/source=([0-9a-z]+?)/')
# Player bundle version embedded in a yastatic.net URL; used by
# get_player_version().
re_pv = re.compile(
    'yastatic.net/yandex-video-player-iframe-api-bundles/([0-9\-\.]+)/'
)
# Name of the record attribute (and of the add_info key) read in call().
RETRANS = 'tcpinfo_total_retrans'


def get_player_version(s):
    """Return the player bundle version found in ``s``, or ``"-"``.

    ``s`` is searched with ``re_pv``; the first captured group is returned.
    A missing value (``None`` or an empty string) yields ``"-"`` instead of
    raising ``TypeError`` inside ``re.search``.
    """
    if not s:
        return "-"
    found = re_pv.search(s)
    if found:
        return found.group(1)
    return "-"

def get_error_vsid(parsed_parameters):
    """Return the ``vsid`` query parameter of the ``src`` URL, or ``None``.

    ``parsed_parameters`` is expected to be a dict holding a ``src`` URL.
    Extraction is best-effort: any ordinary failure (missing ``src``,
    non-dict input, unparsable URL) yields ``None``.
    """
    try:
        src = parsed_parameters.get('src')
        return parse_params(src).get('vsid')
    except Exception:
        # Deliberately broad, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` did.
        return None

def get_resolution(page):
    """Extract the vertical resolution from a chunk request path.

    The candidate is the text after the last ``_`` and before the first
    ``p`` in the part of ``page`` preceding the first ``.``; for example
    ``.../edatv0_169_576p-1482310130000.ts`` gives ``'576'``.

    Returns the candidate string when it is at most four characters long
    and parses as an integer, otherwise ``'-'``. Non-string input (such as
    ``None``) also yields ``'-'``.
    """
    try:
        # Kept inside the ``try`` so a non-string ``page`` is handled too.
        candidate = page.split('.')[0].split('_')[-1].split('p')[0]
        # Explicit check rather than ``assert``, which ``python -O`` strips.
        if len(candidate) > 4:
            return '-'
        int(candidate)
    except (ValueError, TypeError, AttributeError):
        return '-'
    return candidate


def parse_as(as_, asname_dict):
    """Translate AS identifiers into a sorted list of unique names.

    Identifiers missing from ``asname_dict`` map to ``'other'``. The
    ``'other'`` entry is kept only when it is the sole name found. A falsy
    ``as_`` gives an empty list.
    """
    names = {asname_dict.get(number, 'other') for number in (as_ or [])}
    known = names - {'other'}
    return sorted(known if known else names)


def get_as(ip, ip_origins):
    """Look up ``ip`` through ``ip_origins.region_by_ip``.

    Returns whatever the lookup returns, or ``['-']`` when ``ip`` is falsy
    or the lookup fails with an ordinary exception.
    """
    if not ip:
        return ['-']
    try:
        return ip_origins.region_by_ip(ip)
    except Exception:
        # Best-effort lookup; unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return ['-']


def parse_params(url):
    """Return the query-string parameters of ``url`` as a flat dict.

    Each key maps to the first value given for it. Blank values are
    dropped by ``parse_qs``. A falsy ``url`` (``None`` or ``''``) gives an
    empty dict instead of failing inside ``urlparse``.
    """
    if not url:
        return {}
    query = urlparse.urlparse(url).query
    return {
        key: values[0]
        for key, values in urlparse.parse_qs(query).items()
    }


def get_request_params_from_url(request):
    """Classify a chunk request URL.

    Returns ``[channel, request_ts, view_type]`` for a recognised URL,
    ``None`` when ``request`` is falsy or shorter than 20 characters, and
    ``''`` when no rule applies. Rules are tried in a fixed order: the two
    ``cdn1tvru`` markers, the DVR pattern, the "kal" pattern, the two VOD
    patterns, and finally the ``/get-video-hosting`` marker.
    """
    if not request or len(request) < 20:
        return None

    # Fixed CDN markers win over every regular expression below.
    for marker, view_type in (
        ('cdn1tvru/live', 'live'),
        ('cdn1tvru/vod', 'vod'),
    ):
        if marker in request:
            return ['1tv', 0, view_type]

    dvr = request_re.search(request)
    if dvr:
        channel, stamp = dvr.groups()
        # The captured token is divided by 1000 and truncated to an int.
        return [channel, int(int(stamp) / 1000), '']

    kal = request_kal_re.search(request)
    if kal:
        # The captured segment index is multiplied by 5.
        return [kal.group(1), int(5 * int(kal.group(3))), '']

    for vod_pattern in (vod_request_re, zen_request_re):
        vod = vod_pattern.search(request)
        if vod:
            return [vod.group(1), 0, 'vod']

    if '/get-video-hosting' in request:
        return ['ott', 0, 'vod']

    return ''


def get_qs_param(params, name):
    """Return ``params[name]`` when present and truthy, else ``""``.

    ``params`` may be ``None``. List values are returned unchanged.
    """
    lookup = params or {}
    if name not in lookup:
        return ""
    return lookup[name] or ""


def get_qs_param_wrapper(name, *params_list):
    """Return the first truthy ``name`` value across several param dicts.

    The dicts are consulted in the order given; ``""`` is returned when
    none of them holds a truthy value for ``name``.
    """
    candidates = (get_qs_param(params, name) for params in params_list)
    return next((value for value in candidates if value), "")


def get_detailed_error_id(error_id, details):
    """Append ``details['reason']`` to ``error_id`` when it is available.

    Returns ``'<error_id>_<reason>'`` if ``details`` is a dict with a
    truthy ``reason``; otherwise returns ``error_id`` unchanged.
    """
    reason = details.get('reason', '') if isinstance(details, dict) else ''
    return '{}_{}'.format(error_id, reason) if reason else error_id


def get_category_id(parameters, referer_parameters):
    """Return the category id, preferring the request's own parameters.

    ``video_category_id`` from ``parameters`` is used when truthy;
    otherwise the ``category`` value from ``referer_parameters`` is
    returned (``""`` when that is absent too).
    """
    return (
        get_qs_param(parameters, 'video_category_id') or
        get_qs_param(referer_parameters, 'category')
    )


def get_hash(yandexuid, salt='e0440ebc0786e3d2cff6ef51319bc226'):
    """Return the hex MD5 digest of ``yandexuid`` concatenated with ``salt``.

    Text (unicode) input is encoded as UTF-8 before hashing, so non-ASCII
    characters no longer raise ``UnicodeEncodeError``. Digests for plain
    byte strings and ASCII text are unchanged.
    """
    salted = yandexuid + salt
    if not isinstance(salted, bytes):
        salted = salted.encode('utf8')
    return hashlib.md5(salted).hexdigest()


def get_error_details(params):
    """Decode the JSON stored under ``params['details']``.

    Returns ``{}`` when the key is missing and ``{'unparsed': <raw>}`` when
    the value is not valid JSON; otherwise the decoded value.
    """
    try:
        raw = params['details']
    except (KeyError, IndexError):
        return {}
    try:
        return json.loads(raw)
    except ValueError:
        return {'unparsed': raw}


def get_error_id(parsed_parameters, error_details, fatal=False):
    """Build the error identifier for a player log record.

    Resolution order:
      * buffer / fragment / recovery events -> the ``event_id`` itself;
      * stall events -> ``event_id`` combined with the reason from
        ``error_details`` (via ``get_detailed_error_id``);
      * anything else -> ``error_id`` when present, else ``''``.

    ``'_fatal'`` is appended when ``fatal`` is true, and the misspelled
    ``PlayedStalled`` is rewritten to ``Stalled``. Returns ``None`` when the
    parameters cannot be processed at all (for example, they are not a
    dict).

    Missing keys default to ``''``. The previous ``['']`` defaults made the
    set-membership test raise ``TypeError`` (a list is unhashable), so a
    record carrying only ``error_id`` always produced ``None``; it now
    behaves exactly like a record whose ``event_id`` is an empty string.
    """
    try:
        event_id = parsed_parameters.get('event_id') or ''
        error_id = parsed_parameters.get('error_id') or ''
        if event_id in {
            'Buffer.Empty', 'InvalidFragDuration', 'RecoverStreamError',
            'BufferEmptyCritical', 'Buffer.Empty.Critical'
        }:
            result = event_id
        elif event_id in {
            'PlayerStalled', 'PlayedStalled', 'Stalled'
        }:
            result = get_detailed_error_id(event_id, error_details)
        elif error_id:
            result = error_id
        else:
            result = ''
        if fatal:
            result += '_fatal'
        return (result or '').replace('PlayedStalled', 'Stalled')
    except Exception:
        # Best-effort: malformed input yields None and the caller skips it.
        return None


def parse_slots(slots):
    """Return the first comma-separated field of every ``;``-separated pair.

    ``None`` and ``''`` give an empty list. Input that cannot be split
    (for example a non-string) raises ``Exception`` carrying ``repr(slots)``
    so the offending value is visible in the traceback.
    """
    if (slots is None) or (slots == ''):
        return []

    try:
        return [pair.split(',')[0] for pair in slots.split(';')]
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are not converted into a generic Exception.
        raise Exception(repr(slots))


def get_country(region):
    """Return ``short_name`` of the first ``region.path`` entry of type 3.

    Falls back to ``'UNK'`` when the path holds no such entry.
    """
    return next(
        (node.short_name for node in region.path if node.type == 3),
        'UNK'
    )



def get_common_params(result, referer_params, mq_url_params):
    """Fill ``result`` in place with fields shared by chunk and error events.

    Every value is looked up first in ``referer_params`` and then in
    ``mq_url_params``. ``result`` must already contain ``vsid`` and an
    ``add_info`` dict. Returns ``None``.
    """
    sources = (referer_params, mq_url_params)

    def lookup(key):
        return get_qs_param_wrapper(key, *sources)

    result['ref_from'] = lookup('from')
    result['ref_from_block'] = lookup('from_block')
    result['slots_arr'] = parse_slots(lookup('slots'))
    for key in ('reqid', 'partner_id', 'yandexuid'):
        result[key] = lookup(key)

    result['yu_hash'] = lookup('hash')
    # Derive the hash from yandexuid when the URL did not carry one.
    if result['yandexuid'] and not result['yu_hash']:
        result['yu_hash'] = get_hash(result['yandexuid'])

    # Without any hash, the user is identified by the viewing session id.
    result['user_id'] = (
        result['yu_hash'] or 'vsid_{}'.format(result['vsid'])
    )
    result['video_content_id'] = lookup('video_content_id')
    result['add_info']['vc_name'] = lookup('video_content_name')

    # @with_hints(output_schema=preprocessed_schema)

def call(recs):
    """Map raw access-log records to ``chunk`` / ``error`` event records.

    Generator used as the mapper in ``main()``. For every input record it
    either skips the record or yields one ``Record`` built from a ``result``
    dict. Two kinds of events are produced:

    * ``chunk`` -- a ``.ts`` / ``.mp4`` request with status 200 or 206
      whose URL is recognised by ``get_request_params_from_url``;
    * ``error`` -- a request starting with ``/log/`` with status 200 or
      206.

    Records are also skipped when the user agent or timestamp cannot be
    processed, when ``vsid`` is missing or not 64 characters long, and (for
    errors) when no error id can be derived.

    Reads the module-level ``asname_dict``, which ``main()`` fills in.
    """
    # Imported here rather than at module level.
    # NOTE(review): presumably because uatraits is only available where the
    # mapper runs -- confirm.
    import uatraits
    detector = uatraits.detector('/usr/share/uatraits/browser.xml')
    geobase = qr.get('Geobase')
    ip_origins = qr.get('IpOrigins')
    for rec in recs:
        # Drop requests carrying any of these markers.
        if (
            'for-regional-cache=1' in rec.request or
            'monitoring=' in rec.request or
            'hide_stub=' in rec.request or
            'dvrpy=' in rec.request
        ):
            continue
        # Classify the record: media chunk, player error log, or neither.
        if ('.ts' in rec.request or '.mp4' in rec.request) and (
            rec.status in {'200', '206'}
        ):
            result = {'event': 'chunk'}
            params_from_url = get_request_params_from_url(rec.request)
            # Unrecognised URL, or recognised with an empty channel.
            if not params_from_url or not params_from_url[0]:
                continue
        elif rec.request.startswith(
            '/log/'
        ) and rec.status in {'200', '206'}:
            result = {'event': 'error'}
        else:
            continue
        result['source'] = 'strm'
        result['add_info'] = {}
        # Keep the raw inputs alongside the parsed fields.
        result['source_data'] = {
            'referer': rec.referer,
            'request': rec.request,
            'user_agent': rec.user_agent
        }
        try:
            browser = detector.detect(rec.user_agent)
        except (AttributeError, ValueError):
            continue
        try:
            result['timestamp'] = int(rec.timestamp)
        except (AttributeError, ValueError):
            continue
        geo_region = geobase.region_by_ip(rec.ip)
        geo_country = get_country(geo_region)
        result['region'] = geo_region.id
        result['country'] = geo_country
        result['ip'] = rec.ip
        result['add_info']['as'] = get_as(rec.ip, ip_origins)
        result['add_info']['as_parsed'] = parse_as(
            result['add_info']['as'], asname_dict
        )
        result['browser_name'] = str(browser.get('BrowserName'))
        result['browser_version'] = str(browser.get('BrowserVersion'))
        result['os_family'] = str(browser.get('OSFamily'))
        # NOTE(review): the trailing ``or 0`` is a no-op after int(); if
        # either field can be None, int() raises TypeError here -- confirm
        # the input schema.
        result['add_info'][RETRANS] = int(getattr(rec, RETRANS, 0)) or 0
        result['add_info']['bytes_sent'] = int(rec.bytes_sent) or 0
        request_params = parse_params(rec.request)
        referer_params = parse_params(rec.referer)
        # The referer may embed another URL in its ``mq_url`` parameter.
        mq_url_params = parse_params(
            referer_params.get('mq_url', '')
        )
        if result['event'] == 'chunk':
            result['vsid'] = get_qs_param_wrapper(
                'vsid', request_params, referer_params, mq_url_params
            )
            # Only 64-character vsid values are accepted.
            if not result['vsid'] or len(result['vsid']) != 64:
                continue
            result['category_id'] = get_category_id(
                request_params, referer_params
            )
            result['player_version'] = get_player_version(
                rec.referer
            )
            result['resolution'] = get_resolution(
                rec.request
            )
            get_common_params(
                result, referer_params, mq_url_params
            )
            result['channel_old'] = params_from_url[0]
            result['add_info']['request_ts'] = params_from_url[1]
            result['view_type'] = params_from_url[2]
            # 'zen' chunks without a content id get a placeholder.
            if (
                result['channel_old'] == 'zen' and
                not result['video_content_id']
            ):
                result['video_content_id'] = 'novcid'
        elif result['event'] == 'error':
            # Same value as request_params above, parsed a second time.
            params = parse_params(rec.request)
            # For errors the vsid comes from the ``src`` URL parameter.
            result['vsid'] = get_error_vsid(params)
            if not result['vsid'] or len(result['vsid']) != 64:
                continue
            get_common_params(
                result, referer_params, mq_url_params
            )
            result['add_info']['error_details'] = get_error_details(
                params
            )
            result['fatal'] = params.get('fatal', 'false') == 'true'
            result['error_id'] = get_error_id(
                params, result['add_info']['error_details'],
                fatal=result['fatal']
            )
            if not result['video_content_id']:
                result['video_content_id'] = 'novcid'
            # An error record without an id is dropped.
            if not result['error_id']:
                continue
        yield Record(**result)
        # Unbinds the local name before the next iteration rebinds it.
        del result
        # gc.collect()


def get_sample():
    """Load the local JSON-lines sample as a list of ``Record`` objects.

    Reads ``20180814strm_small.jsonlines`` (UTF-8) from the current working
    directory. Lines that fail to decode or convert are skipped.
    """
    sample = []
    with codecs.open('20180814strm_small.jsonlines', 'r', 'utf8') as f:
        for line in f:
            try:
                record = Record.from_dict(to_bytes(json.loads(line)))
            except Exception:
                # Best-effort load; narrowed from a bare ``except:`` so
                # Ctrl-C still interrupts the read.
                continue
            sample.append(record)
    return sample


def to_bytes(dct):
    """Return a copy of ``dct`` with UTF-8 encoded keys and text values.

    Every key is encoded; a value is encoded only when it is a ``unicode``
    instance, and is otherwise copied as is. Nested containers are not
    traversed.
    """
    encoded = {}
    for key, value in dct.items():
        if isinstance(value, unicode):
            value = value.encode('utf8')
        encoded[key.encode('utf8')] = value
    return encoded

def main():
    """Run ``call`` over the local sample under ``line_profiler``.

    Steps:
      1. connect to the Hahn cluster using tokens read from disk;
      2. fill the module-level ``asname_dict`` from ``ASNAME_TABLE``;
      3. feed the sample from ``get_sample()`` through ``call`` in a debug
         job, collecting the output in a local list;
      4. dump line-profiler statistics to ``profile2.prof``.
    """
    global asname_dict

    def read_secret(path):
        # ``with`` closes the token file instead of leaking the handle.
        with open(path) as secret_file:
            return secret_file.read().strip()

    hahn = clusters.yql.Hahn(
        pool='search-research_{}'.format(getpass.getuser()),
        token=read_secret('/home/pecheny/.yt/token'),
        yql_token=read_secret('/home/pecheny/.yql_token'),
    )

    asname_dict = {
        rec.ASN: rec.ISP
        for rec in hahn.driver.yt_driver.read(ASNAME_TABLE)
    }

    job = hahn.job()
    sample = get_sample()
    output_records = []
    # NOTE(review): ``sp`` / ``spf`` are not used by the job below, which
    # maps ``call``. They are kept because constructing StrmParser may have
    # side effects -- confirm and remove if not.
    sp = StrmParser(asname_dict)
    sp.__name__ = 'foo'
    spf = profile()(sp)

    job.table('').debug_input(
        sample
    ).map(
        call
    ).debug_output(
        output_records
    )
    # Imported locally so the module loads without line_profiler installed.
    import line_profiler

    profiler = line_profiler.LineProfiler()
    profiler.add_function(call)
    profiler.enable()
    job.debug_run()
    profiler.disable()
    profiler.dump_stats('profile2.prof')


# Run the local debug/profiling harness only when executed as a script.
if __name__ == "__main__":
    main()

