# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os


# Shared YT (Arnold) cluster handle used by every job in this script.
# `job_root` is the template behind the `$job_root/...` table paths below.
cluster = clusters.yt.Arnold(pool='vika-pavlova').env(
    templates={
        'job_root': '//home/videolog/vika-pavlova/4167-station_cube',
    },
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'tentative_pool_trees': ["cloud"],
    },
    parallel_operations_limit=10,
)


def parse_session(date):
    """Extract one day of video events from the AppMetrica log.

    Reads ``logs/appmetrica-yandex-events/1d/<date>``, keeps foreground-session
    events for api_key 999537 whose name starts with ``Video`` or equals
    ``progressHeartbeatVideo``, and stores the selected fields in
    ``$job_root/raw_data_<date>``.

    :param date: day to process, as the string used in the table names.
    """
    job = cluster.job()
    source = job.table('logs/appmetrica-yandex-events/1d/' + date)

    wanted_fields = [
        'app_platform',
        'device_id',
        'event_name',
        'raw_event_value',
        'session_type',
        'date',
        'geo_id',
        'event_timestamp',
        'api_key',
        se.dictitem('AppID', from_='parsed_log_line'),
        se.dictitem('Model', from_='parsed_log_line'),
    ]

    # Either a "Video*" event or the periodic playback heartbeat.
    is_video_event = sf.or_(
        sf.custom(lambda x: type(x)== str and x.startswith('Video'), 'event_name'),
        sf.equals('event_name', 'progressHeartbeatVideo'),
    )
    row_filters = [
        sf.equals('api_key', 999537),
        sf.equals('session_type', 'SESSION_FOREGROUND'),
        is_video_event,
    ]

    source.qb2(
        log='appmetrica-events-log',
        fields=wanted_fields,
        filters=row_filters,
    ).put('$job_root/raw_data_' + date)

    job.run()


def get_host(provider, item, track):
    """Return the content host for an event, derived from its provider.

    For the ``YAVIDEO`` / ``YAVIDEO_PROXY`` providers the host is the network
    location of ``item`` (or of ``track`` when ``item`` is the ``'-'``
    placeholder). Every other known provider maps to a fixed host name.

    :param provider: provider name taken from the event payload.
    :param item: item id of the event; a URL for the YAVIDEO providers.
    :param track: track id of the event, used when ``item`` is ``'-'``.
    :returns: host string, or ``None`` when the provider is not recognised.
    """
    if provider in ('YAVIDEO_PROXY', 'YAVIDEO'):
        source = item if item != '-' else track
        try:
            return urlparse.urlparse(source).netloc
        except Exception:
            # Best-effort fallback when the value cannot be parsed as a URL:
            # drop the scheme and everything from the first '?'.
            return str(source).replace('https://', '').replace('http://', '').split('?')[0]

    fixed_hosts = {
        'YOUTUBE': 'youtube.com',
        'KINOPOISK': 'kinopoisk.ru',
        'STRM': 'vh',
        'AMEDIATEKA': 'www.amediateka.ru',
        'IVI': 'www.ivi.ru',
    }
    # Unknown providers yield None, as the original if/elif chain did.
    return fixed_hosts.get(provider)

def map_recs(recs):
    """Flatten raw events: parse the JSON payload and emit one record each.

    For every input record the ``raw_event_value`` JSON is decoded (vertical
    tab characters are removed first), the video attributes are URL-unquoted
    and the host is resolved with ``get_host``. Records whose payload cannot
    be processed are skipped.

    :param recs: iterable of raw event records.
    :returns: generator of ``Record`` objects with the fields device, val,
        url, item, title, provider, dur, host, date, event_name, app,
        session_type, event_timestamp, geo_id and model.
    """
    for rec in recs:
        try:
            val = rec.raw_event_value
            v = json.loads(val.replace('\x0b', ''))
            url = urllib.unquote(v.get('videoUrl', '-'))
            item = urllib.unquote(v.get('itemId', '-'))
            title = urllib.unquote(v.get('itemName', '-'))
            provider = urllib.unquote(v.get('provider', '-'))
            track = urllib.unquote(v.get('trackId', '-'))
            dur = v.get('time', -1)

            host = get_host(provider, item, track)

            out = Record(
                device=rec.device_id if rec.device_id else "Unknown",
                val=val, url=url, item=item, title=title,
                provider=provider, dur=dur, host=host, date=rec.date,
                event_name=rec.event_name, app=rec.AppID,
                session_type=rec.session_type,
                event_timestamp=rec.event_timestamp,
                geo_id=rec.geo_id, model=rec.Model,
            )
        except Exception:
            # Best-effort: a malformed payload drops this record only.
            continue
        # Yield outside the try block so that GeneratorExit (raised at the
        # yield when the generator is closed) is never swallowed.
        yield out

def reduce_recs(group):
    """Collapse the events of one (date, url, device) group into one record.

    For each group the reducer picks the first non-placeholder title,
    provider and host, marks the model as ``'st2'`` if any event came from a
    ``yandexstation_2`` device (``'st1'`` otherwise) and accumulates ``tvt``:
    the sum of gaps between consecutive ``progressHeartbeatVideo`` events that
    are shorter than 45 timestamp units (presumably seconds -- confirm against
    the log schema). Records are expected to arrive ordered by
    ``event_timestamp``.

    :param group: iterable of ``(key, records)`` pairs; ``key`` exposes
        ``date``, ``url`` and ``device``.
    :returns: generator of ``Record`` objects with fielddate, provider, host,
        url, title, device, tvt and model.
    """
    heartbeat = 'progressHeartbeatVideo'
    for key, recs in group:
        title = ""
        provider = "-"
        host = "-"

        tvt = 0
        prev_hb = 0
        prev_event = ''
        model = 'st1'
        for rec in recs:
            if rec.model == 'yandexstation_2':
                model = 'st2'
            # Keep the first meaningful value seen for each attribute.
            if title == "" and rec.title != '' and rec.title != '-':
                title = rec.title
            if provider == '-' and rec.provider != '' and rec.provider != '-':
                provider = rec.provider
            if host == '-' and rec.host != '' and rec.host != '-':
                host = rec.host

            cur_hb = rec.event_timestamp
            cur_event = rec.event_name
            # Only two heartbeats in a row count as watched time; a longer gap
            # or any other event in between breaks the chain.
            if (cur_event == heartbeat and prev_event == heartbeat
                    and (cur_hb - prev_hb) < 45):
                tvt += cur_hb - prev_hb
            prev_hb = cur_hb
            prev_event = cur_event
        yield Record(fielddate=key.date, provider=provider, host=host,
                     url=key.url, title=title, device=key.device,
                     tvt=tvt, model=model)

def recs_combination(recs):
    """Expand each record into every provider/host/title roll-up combination.

    Each of the three dimensions is emitted both with its own value and with
    the ``'_total_'`` marker, so one input record produces eight output
    records. The device, tvt, fielddate and model fields are copied unchanged.

    :param recs: iterable of reduced records.
    :returns: generator of ``Record`` objects, eight per input record.
    """
    total = '_total_'
    for rec in recs:
        dimensions = product(
            (rec.provider, total),
            (rec.host, total),
            (rec.title, total),
        )
        for provider, host, title in dimensions:
            yield Record(
                provider=provider,
                host=host,
                title=title,
                device=rec.device,
                tvt=rec.tvt,
                fielddate=rec.fielddate,
                model=rec.model,
            )

def calc_for_stat(date):
    """Build the daily report table from the extracted raw events.

    Steps: parse the raw events, reduce them per (date, url, device), keep the
    50 titles with the largest total tvt (all other titles become ``'other'``),
    expand every record into its roll-up combinations and aggregate device
    counts and tvt, overall and per model. The result is written to
    ``$job_root/final_<date>``.

    :param date: day to process, as the string used in the table names.
    """
    job = cluster.job()

    events = (
        job.table('$job_root/raw_data_' + date)
        .filter(sf.defined('raw_event_value'))
        .map(map_recs)
    )

    per_view = (
        events
        .groupby('date', 'url', 'device')
        .sort("event_timestamp")
        .reduce(reduce_recs)
    )

    # Top 50 non-empty titles by total watch time.
    top_titles = (
        per_view
        .filter(sf.not_(sf.equals('title', "")))
        .groupby('title')
        .aggregate(total_tvt=na.sum('tvt'))
        .top(50, by='total_tvt')
    )

    # Titles without a (truthy) total_tvt from the left join are relabelled.
    labelled = (
        per_view
        .join(top_titles, by='title', type='left')
        .project(
            ne.all(),
            title=ne.custom(lambda x, y: x if y else 'other',
                            'title', 'total_tvt'),
        )
        .sort('title')
    )

    cube = (
        labelled
        .map(recs_combination)
        .groupby('fielddate', 'provider', 'host', 'title')
        .aggregate(
            device_count=na.count_distinct('device'),
            st1_device_count=na.count_distinct(
                'device', predicate=nf.custom(lambda x: x == 'st1', 'model')),
            st2_device_count=na.count_distinct(
                'device', predicate=nf.custom(lambda x: x == 'st2', 'model')),
            st1_tvt=na.sum(
                'tvt', predicate=nf.custom(lambda x: x == 'st1', 'model')),
            st2_tvt=na.sum(
                'tvt', predicate=nf.custom(lambda x: x == 'st2', 'model')),
            tvt=na.sum('tvt'),
        )
    )
    cube.sort('tvt').put('$job_root/final_' + date)

    job.run()

def put_to_stat(date):
    """Publish the final table of the given day to the Statface report.

    The Statface token is read from the ``STAT_TOKEN`` environment variable;
    a missing variable raises ``KeyError``.

    :param date: day to publish, as the string used in the table names.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN'],
    )

    report = ns.StatfaceReport()
    report = report.path('Video.All/station_cube')
    report = report.scale('daily')
    report = report.client(stat_client)

    final_table = '//home/videolog/vika-pavlova/4167-station_cube/final_' + date
    report.remote_publish(
        proxy='arnold',
        table_path=final_table,
        async_mode=False,
        upload_config=False,
    )


def main():
    """Run the whole pipeline for every day in the requested date range.

    Command-line arguments ``--start_date`` and ``--end_date`` (both required,
    inclusive) define the range. For each day the raw events are extracted,
    the report table is calculated and published, and the intermediate raw
    table of that day is removed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    args = parser.parse_args()

    for date in pd.date_range(start=args.start_date, end=args.end_date):
        date_str = date.strftime('%Y-%m-%d')
        parse_session(date_str)
        calc_for_stat(date_str)
        put_to_stat(date_str)

        # Clean up per day: doing this once after the loop removed only the
        # last day's table and failed with NameError on an empty range.
        cluster.driver.remove('//home/videolog/vika-pavlova/4167-station_cube/raw_data_' + date_str)


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
