#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nile.api.v1 import (
    Record,
    files,
    clusters,
    cli,
    with_hints,
    extended_schema,
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns #obligatory for Statface
)
from qb2.api.v1 import (
    QB2,
    resources as sr,
    filters as sf
)

import os #obligatory for Statface
import sys #obligatory for Statface
import re #obligatory for Statface
import argparse #obligatory for Statface
import getpass #obligatory for Statface
import datetime
import time


# Event paths grouped into three tiers. ``myMap_coll`` keeps only events whose
# path appears in one of these lists; ``myMap_coll2`` emits actiontype "1" for
# paths in ``scores1``, "2" for paths in ``scores2`` and "3" for everything else.
#
# History: the "finish.card.like" / "finish.*.subscribe" entries (tier 1) and
# the "finish.card.share" / "finish.card.comment" / "favorite" entries (tier 2)
# used to be appended only to the ``scoresN`` lists; they now live in the tier
# lists themselves.
c1 = [
    "finish.card.like",
    "finish.board.subscribe",
    "finish.channel.subscribe",
    "finish.user.subscribe",
    "click.card.daily.subscribe",
]
c2 = [
    "finish.card.share",
    "finish.card.comment",
    "favorite",
    "click.card.wish",
    "click.ya_share",
    "finish.web.share",
    "click.contest.like",
    "click.contest.dislike",
]
c3 = [
    "pass.contest.submission",
    "finish.card.create",
]

# The scoring lists are plain aliases of the tier lists (same list objects).
scores1 = c1
scores2 = c2
scores3 = c3

@with_hints(
    output_schema=dict(
        path=str,
        device=str,
        cardId=str,
        fielddate=str,
        count=int
    )
)
def myMap_coll(recs):
    """Turn raw log lines into one counted record per tracked event.

    Every input record is expected to expose ``value``: a tab-separated line
    of ``key=value`` items (items without ``=`` are ignored).

    A line is dropped when:
      * it has no ``path`` or the path is not listed in ``c1``/``c2``/``c3``;
      * the path is "finish.card.create" and ``ar`` is "8" or "16";
      * the path is "finish.card.create" and there is no ``cardId``.

    Yields ``Record(path, device, cardId, fielddate, count=1)`` where
      * ``device`` is "touch" when ``cid`` equals "73096", "desktop" for any
        other ``cid`` and ``None`` when ``cid`` is absent;
      * ``cardId`` is only filled for "finish.card.create" events;
      * ``fielddate`` is the first 10 characters of ``iso_eventtime``.

    Raises:
        KeyError: if a kept line has no ``iso_eventtime`` field.
    """
    # Built once per call instead of concatenating three lists per record.
    tracked_paths = frozenset(c1 + c2 + c3)

    for rec in recs:
        fields = dict(
            item.split('=', 1) for item in rec.value.split('\t') if '=' in item
        )

        path = fields.get("path")
        if path not in tracked_paths:
            continue

        card_id = None
        if path == "finish.card.create":
            if fields.get("ar") in ("8", "16"):
                continue
            card_id = fields.get("cardId")
            if card_id is None:
                continue

        device = None
        if "cid" in fields:
            device = "touch" if fields["cid"] == "73096" else "desktop"

        # ``yandexuid`` is deliberately not read: it is not part of the output,
        # so a line without it must not abort the whole mapper.
        event_time = fields["iso_eventtime"]
        yield Record(
            path=path,
            device=device,
            cardId=card_id,
            fielddate=event_time[:10],
            count=1
        )

@with_hints(
    output_schema=dict(
        actiond=str,
        actiontype=str,
        device=str,
        fielddate=str,
        count=int
    )
)
def myMap_coll2(recs):
    """Attach an action type to every event record.

    Reads ``path``, ``device``, ``fielddate`` and ``count`` from each input
    record. The path is re-emitted as ``actiond``; ``actiontype`` is the
    string "1" for paths in ``scores1``, "2" for paths in ``scores2`` and
    "3" for any other path.
    """
    for rec in recs:
        event_path = rec.path
        tier = 1 if event_path in scores1 else (2 if event_path in scores2 else 3)
        yield Record(
            actiond=event_path,
            actiontype=str(tier),
            device=rec.device,
            fielddate=rec.fielddate,
            count=rec.count
        )

@with_hints(
    output_schema=dict(
        path=str,
        device=str,
        fielddate=str,
        count=int
    )
)
def fav_replace_yandexApp(recs):
    """Convert per-UI hit records into 'favorite' event records.

    Reads ``ui``, ``fielddate`` and ``hits`` from each input record. The UI
    name becomes ``device`` ("webmobileapp" is renamed to "yandexApp" and
    "pad" to "tablet"; any other value is passed through), ``hits`` becomes
    ``count`` and ``path`` is always 'favorite'.
    """
    ui_renames = {"webmobileapp": "yandexApp", "pad": "tablet"}
    for rec in recs:
        ui = rec.ui
        yield Record(
            fielddate=rec.fielddate,
            device=ui_renames.get(ui, ui),
            path='favorite',
            count=rec.hits
        )

def goals_id(r):
    """Tell whether ``r`` contains any of the goal ids of interest.

    ``r`` is tested with ``in``, so it may be a string (substring match) or a
    container of ids. The result is True when any single goal id is present,
    or when both '20269040' and '20269045' are present together.
    """
    single_goals = (
        '22652040',
        '20268850',
        '20363050',
        '20363055',
        '23402120',
        '20268855',
        '32272887',
    )
    if any(goal in r for goal in single_goals):
        return True
    # These two ids only count when they occur together.
    return '20269040' in r and '20269045' in r

def remove_toloka(r):
    """Return True when ``r`` contains neither "toloka" nor "role".

    ``r`` is tested with ``in``, so for a string this is a substring check.
    """
    return not ("toloka" in r or "role" in r)

# https://wiki.yandex-team.ru/JandexMetrika/doc/VisitLog/
@with_hints(
    output_schema=dict(
        path=str,
        device=str,
        fielddate=str,
        count=int
    )
)
def myReduce(recs):
    """Count goal occurrences per visit group.

    ``recs`` yields ``(key, records)`` pairs. Within a group, a record whose
    ``Type`` converts to 1 (i.e. ``2 * Type - 1 == 1``) supplies
    ``RecountRequestID``; the group is skipped when no such record provides a
    non-None id. Otherwise the LAST record of the group provides
    ``iso_eventtime``, ``IsMobile`` and ``Goals_ID``.

    For every goal in the table below one record is yielded, including goals
    whose count is 0: ``Record(device, path, count, fielddate)`` where
    ``device`` is "touch" when ``IsMobile`` differs from "0" (else "desktop"),
    ``count`` is ``Goals_ID.count(goal_id)`` and ``fielddate`` is the first
    10 characters of ``iso_eventtime``.
    """
    # (output path, goal id) pairs, emitted in this order. Goals '22652040'
    # ("finish.card.create") and '32272887' ("click.social.share") are
    # intentionally not counted here; '20269040' is likewise not used for
    # "finish.card.share".
    goal_table = (
        ("finish.card.share", '20269045'),
        ("finish.card.comment", '20268850'),
        ("finish.channel.subscribe", '23402120'),
        ("finish.user.subscribe", '20363050'),
        ("finish.board.subscribe", '20363055'),
        ("finish.card.like", '20268855'),
    )

    for key, records in recs:
        recount_request_id = None
        last_rec = None
        for last_rec in records:
            # NOTE(review): presumably the VisitLog "Sign" convention
            # (Type 1 -> +1, Type 0 -> -1) — confirm against the log docs.
            sign = 2 * float(last_rec.Type) - 1
            if sign == 1:
                recount_request_id = last_rec.RecountRequestID

        if recount_request_id is None:
            continue

        # NOTE(review): the values below come from the last record of the
        # group, which is not necessarily the record that supplied the id.
        device = "desktop"
        if last_rec.IsMobile != "0":
            device = "touch"
        goals = last_rec.Goals_ID
        fielddate = last_rec.iso_eventtime[:10]

        for path, goal_id in goal_table:
            yield Record(
                device=device,
                path=path,
                count=goals.count(goal_id),
                fielddate=fielddate
            )

def card_status(w):
    """Return '2' when ``w`` is exactly ``True``, and '3' for anything else."""
    return '2' if w is True else '3'

@with_hints(output_schema=extended_schema())
def filter_dates(recs):
    """Keep records whose ``fielddate`` lies in the 30 days ending at ``d``.

    Both ``rec.fielddate`` and ``rec.d`` are parsed as 'YYYY-MM-DD'. A record
    passes when ``d - 29 days <= fielddate <= d``; it is yielded unchanged.
    """
    date_format = '%Y-%m-%d'
    one_day = datetime.timedelta(days=1)
    window_length = datetime.timedelta(days=30)

    for rec in recs:
        event_day = datetime.datetime.strptime(rec.fielddate, date_format).date()
        anchor_day = datetime.datetime.strptime(rec.d, date_format).date()

        # Half-open window [window_start, window_end).
        window_end = anchor_day + one_day
        window_start = window_end - window_length

        if window_start <= event_day < window_end:
            yield rec

def s_str(md):
    """Return the first day to read, as a 'YYYY-MM-DD' string.

    The result is 30 days before the smallest date string in ``md``, but
    never earlier than 2018-04-09.
    """
    earliest_allowed = datetime.date(2018, 4, 9)
    first_requested = datetime.datetime.strptime(min(md), '%Y-%m-%d').date()
    lookback_start = first_requested - datetime.timedelta(days=30)
    return str(max(lookback_start, earliest_allowed))


def filter0912(f):
    """Return True when the 'YYYY-MM-DD' string ``f`` is 2018-09-12 or later."""
    cutoff = datetime.date(2018, 9, 12)
    parsed = datetime.datetime.strptime(f, '%Y-%m-%d').date()
    return parsed >= cutoff


@cli.statinfra_job
def make_job(job, nirvana, statface_client, options):
    """Assemble the job that builds the collections action tables.

    Inputs:
      * the redir-log day tables from ``s_str(options.dates)`` up to
        ``options.dates[-1]``;
      * the ``cards_dump`` table.

    Outputs (all under ``$job_root/collections/``):
      * one table per day for the 31 days ending at ``options.dates[-1]``
        with summed event counts;
      * ``finishcardcreate``, ``cards30d``, ``valid_board_ids`` and
        ``entity_cards30d``.

    ``statface_client`` is not used in this function. Returns the configured
    job.
    """
    yt_spec = {"pool_trees": ["physical"], "tentative_pool_trees": ["cloud"]}
    path_templates = {
        "job_root": nirvana.directories[0],
        "tmp_root": '//home/images/tmp',
    }
    job = job.env(yt_spec_defaults=yt_spec, templates=path_templates)

    dates = options.dates
    # NOTE(review): assumes the last element of options.dates is the latest
    # day, while s_str() uses min() of the list — confirm the list is sorted.
    last_day = dates[-1]
    log_range = "".join(["{", s_str(dates), "..", last_day, "}"])

    # --- Events from the redir log -------------------------------------
    redir_log = job.table(
        '//home/images/tmp/rkam/MMA-1607/data/collections-redir-log/1d/' + log_range
    )
    parsed_events = redir_log.map(myMap_coll)
    # NOTE(review): presumably split() returns (non-matching, matching), as
    # the original variable names imply — confirm against the Nile docs.
    other, finishcardcreate = parsed_events.split(
        nf.equals('path', "finish.card.create")
    )

    daily_counts = (
        other.map(myMap_coll2)
        .groupby('fielddate', 'actiond', 'actiontype', 'device')
        .aggregate(count=na.sum('count'))
        .project(ne.all(), service=ne.const("collections"))
    )

    # One output table per day: the last day and the 30 days before it.
    last_day_dt = datetime.datetime.strptime(last_day, '%Y-%m-%d')
    for days_back in range(0, 31):
        day = (last_day_dt - datetime.timedelta(days=days_back)).strftime('%Y-%m-%d')
        daily_counts.filter(nf.equals('fielddate', day)).put('$job_root/collections/' + day)

    # --- Cards from the dump -------------------------------------------
    cards_dump = job.table('//home/images/tmp/rkam/MMA-1597/collections/cards_dump')
    # NOTE(review): card_status returns the strings '2'/'3', yet the hint
    # below declares type=bool — confirm this is intended.
    cards_raw = (
        cards_dump
        .project(ne.all(), d=ne.const(str(last_day)))
        .map(filter_dates)
        .unique('fielddate', 'path', 'cardId', 'is_private')
        .project(
            ne.all(),
            actiond='path',
            device=ne.const("desktop"),
            service=ne.const("collections"),
            actiontype=ne.custom(card_status, 'is_private').add_hints(type=bool)
        )
    )

    public_image_without_doc = nf.and_(
        nf.equals('source_type', 'image'),
        nf.not_(sf.defined('doc_id')),
        nf.equals('is_private', False)
    )
    cards, cards2 = cards_raw.split(public_image_without_doc)
    cards3 = cards2.join(finishcardcreate, by='cardId')

    finishcardcreate.put('$job_root/collections/finishcardcreate')

    cards_0911, cards_0912 = cards.split(nf.custom(filter0912, 'fielddate'))
    video_with_doc = nf.and_(nf.equals("source_type", 'video'), sf.defined("doc_id"))
    cards_status = cards_0911.filter(nf.not_(video_with_doc))

    tous_ensemble = job.concat(cards_status, cards_0912, cards3)

    dimensions = ('fielddate', 'actiond', 'actiontype', 'device', 'service')

    (
        tous_ensemble
        .groupby(*dimensions)
        .aggregate(count=na.count())
        .put('$job_root/collections/cards30d')
    )

    (
        tous_ensemble
        .project('fielddate', 'board_id')
        .unique('fielddate', 'board_id')
        .put('$job_root/collections/valid_board_ids')
    )

    is_entity = nf.or_(nf.equals('source_type', 'series'), nf.equals('source_type', 'film'))
    (
        cards.filter(is_entity)
        .groupby(*dimensions)
        .aggregate(count=na.count())
        .put('$job_root/collections/entity_cards30d')
    )

    return job


# Script entry point: when executed directly, call the Nile CLI runner
# (``cli.run()``); nothing runs on plain import.
if __name__ == '__main__':
    cli.run()

