#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
import json
import re
from pytils import date_range

# Fixed date whose formatted form is passed as the ``date`` template value
# of the cluster environment in main().
DATE = datetime.date(2018, 5, 23)
DATE_F = DATE.strftime('%Y-%m-%d')

# Patterns that pull "/key=value/" style fields out of a tracking string.
# The Python 2 only ``ur''`` prefix is replaced with ``u''``: the patterns
# contain no backslashes, so the compiled expressions are identical while
# the module stays parseable on Python 3 as well.
re_yu = re.compile(u'/yandexuid=([0-9]+?)/')
re_chid = re.compile(u'/channel_id=([0-9a-z]+?)/')
re_source = re.compile(u'/source=([0-9a-z]+?)/')


def extract_jstracer(records):
    """Turn raw js_tracer records into flat per-event records.

    Each input record is expected to expose a ``Data`` attribute holding a
    JSON document. Only ``Start`` and ``30SecHeartbeat`` events are kept.
    A record is skipped when its payload cannot be parsed, is not a JSON
    object, has no yandexuid in the first ``init`` tracking string, or has
    no ``videoContentId``. ``source`` and ``channel_id`` are optional and
    default to ``None`` when absent from the tracking string.

    Args:
        records: iterable of objects with a ``Data`` attribute.

    Yields:
        Record with ``yandexuid``, ``channel_id``, ``source``, ``vcid``,
        ``vsid`` and ``event`` fields.
    """
    wanted_events = {'Start', '30SecHeartbeat'}
    for rec in records:
        try:
            data = json.loads(rec.Data)
        except (AttributeError, TypeError, ValueError):
            # Missing field, non-string payload or malformed JSON. Named
            # exceptions are used instead of a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, null, ...).
            continue
        event = data.get('eventName', '')
        if event not in wanted_events:
            continue
        try:
            source_info = data['data']['source']
            # All three identifiers live in the same tracking string, so
            # it is looked up once and reused below.
            tracking_url = source_info['trackings'][
                'trackingEvents'
            ][0]['init'][0]
            yandexuid = re_yu.search(tracking_url).group(1)
        except (KeyError, ValueError, IndexError, AttributeError, TypeError):
            # TypeError covers null / wrongly typed nodes on the path.
            continue
        source_match = re_source.search(tracking_url)
        source = source_match.group(1) if source_match else None
        channel_match = re_chid.search(tracking_url)
        channel_id = channel_match.group(1) if channel_match else None
        try:
            vcid = source_info['adConfig']['videoContentId']
        except (KeyError, TypeError):
            continue
        yield Record(
            yandexuid=yandexuid,
            channel_id=channel_id,
            source=source,
            vcid=vcid,
            vsid=data.get('sid'),
            event=event
        )


def process_date(date, hahn):
    """Build and run the js_tracer aggregation job for a single day.

    Reads ``home/js_tracer/day_by_day/<dd-mm-YYYY>``, extracts events with
    ``extract_jstracer``, left-joins them with the channel metadata table
    on ``vcid == JoinKey`` and writes the joined table plus several
    aggregates under ``$job_root/<YYYY-mm-dd>/``.

    Every output path uses the same ``%Y-%m-%d`` string. Previously some
    paths formatted the raw ``date`` object, which yields a different
    directory name if ``date`` is a ``datetime.datetime``.

    Args:
        date: object with ``strftime`` (a date or datetime) to process.
        hahn: cluster object used to create the job.
    """
    date_s = date.strftime('%Y-%m-%d')
    # The input tables are named day-first.
    date_reversed = date.strftime('%d-%m-%Y')

    job = hahn.job().env(parallel_operations_limit=10)

    concat = job.table(
        'home/videolog/strm_meta/iron_branch/concat'
    ).project(
        'computed_channel', 'computed_program', 'JoinKey'
    )

    preaggr = job.table(
        'home/js_tracer/day_by_day/{}'.format(date_reversed)
    ).map(
        extract_jstracer
    ).join(
        concat, type='left', by_left='vcid', by_right='JoinKey'
    ).project(
        'event', 'yandexuid', 'vcid', 'vsid', 'computed_channel',
        'channel_id', 'source',
        'computed_program'
    ).put(
        '$job_root/{}/joined_with_channels'.format(date_s), schema=dict(
            yandexuid=str, vcid=str, event=str, source=str,
            computed_channel=str, computed_program=str, vsid=str,
            channel_id=str
        )
    )

    # Event counts per user and channel.
    preaggr.groupby(
        'yandexuid', 'computed_channel', 'event'
    ).aggregate(
        event_count=na.count()
    ).put(
        '$job_root/{}/aggr_by_yu_channel'.format(date_s), schema=dict(
            yandexuid=str, event=str, event_count=int,
            computed_channel=str
        )
    )

    # Same counts split by source; the 'morda' rows are also stored
    # separately.
    preaggr.groupby(
        'yandexuid', 'source', 'computed_channel', 'event'
    ).aggregate(
        event_count=na.count()
    ).put(
        '$job_root/{}/aggr_by_yu_channel_source'.format(date_s), schema=dict(
            yandexuid=str, event=str, event_count=int, source=str,
            computed_channel=str
        )
    ).filter(
        nf.equals('source', 'morda')
    ).put(
        '$job_root/{}/aggr_by_yu_channel_source_morda'.format(date_s),
        schema=dict(
            yandexuid=str, event=str, event_count=int, source=str,
            computed_channel=str
        )
    )

    # Distinct users and total events per computed channel.
    preaggr.groupby(
        'computed_channel', 'event'
    ).aggregate(
        yandexuids=na.count_distinct('yandexuid'),
        events_total=na.count()
    ).put(
        '$job_root/{}/aggr_by_channel'.format(date_s), schema=dict(
            computed_channel=str, event=str, yandexuids=int,
            events_total=int
        )
    )

    # Distinct users and total events per channel id taken from the
    # tracking string.
    preaggr.groupby(
        'channel_id', 'event'
    ).aggregate(
        yandexuids=na.count_distinct('yandexuid'),
        events_total=na.count()
    ).put(
        '$job_root/{}/aggr_by_channel_id'.format(date_s), schema=dict(
            channel_id=str, event=str, yandexuids=int,
            events_total=int
        )
    )

    job.run()


def main():
    """Parse ``--from`` / ``--to`` and process every date in that range.

    Creates the cluster client once and calls ``process_date`` for each
    value produced by ``date_range(from_, to_)``. Both options are passed
    to ``date_range`` exactly as given on the command line (``None`` when
    omitted).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--from')
    parser.add_argument('--to')
    # Parsed once; the original parsed the command line a second time for
    # no effect.
    args = parser.parse_args()

    # ``from`` is a reserved word, so plain attribute access is impossible.
    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')

    # Read the token through a context manager so the file handle is
    # closed deterministically.
    with open('/home/pecheny/.yt/token') as token_file:
        token = token_file.read().strip()

    hahn = clusters.yt.Hahn(
        pool='search-research_{}'.format(getpass.getuser()),
        token=token
    ).env(
        templates=dict(
            job_root='home/videolog/users_by_js_tracer',
            date=DATE_F
        )
    )

    for date in date_range(from_, to_):
        process_date(date, hahn)

# Run the job only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
