#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nile.api.v1 import (
    Record,
    files,
    clusters,
    cli,
    files as nfl,
    with_hints,
    extended_schema,
    multischema,
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns #obligatory for Statface
)
from qb2.api.v1 import (
    QB2,
    filters as sf,
    extractors as se,
    resources as sr
)
from qb2.api.v1.typing import Optional, Json, String

import os #obligatory for Statface
import sys #obligatory for Statface
import re #obligatory for Statface
import argparse #obligatory for Statface
import datetime
import time
import re
import ast
import urllib
import itertools

# https://hitman.yandex-team.ru/projects/edinoe_izbrannoe/EI_Stats_MMA_1923
# Differences from v1: the last event is no longer counted twice; referers are fixed for
#  heartbeats on Efir/Video; videohub counters are fixed; tech events and show events are
#  dropped from the Baobab events; global-notifications are counted as Zen; session length
#  is capped at 30*60 (users who left for an unknown destination are not counted);
#  "unknown" is handled in service_map

@with_hints(
    # NOTE(review): the Records yielded below also carry a `ticker` field that is
    # not declared in this schema — confirm whether it should be listed here.
    output_schema=dict(
        page_from=str,
        page_to=str,
        ui=str,
        notif_type=str,
        clong=int,
        overlong=int
    )
)
def add_totals2(recs):
    """Fan every record out into its '_total_' roll-up combinations.

    For each of page_from, page_to, ui and notif_type the record is emitted
    both with its own value and with the literal '_total_', giving 16 output
    records per input record. clong, overlong and ticker are copied unchanged
    into every emitted record.
    """
    for rec in recs:
        # One tuple of alternatives per output field; single-element tuples
        # are the pass-through measures.
        axes = (
            (rec.page_from, '_total_'),
            (rec.page_to, '_total_'),
            (rec.ui, '_total_'),
            (rec.notif_type, '_total_'),
            (rec.clong, ),
            (rec.overlong, ),
            (rec.ticker, ),
        )
        for page_from, page_to, ui, notif_type, clong, overlong, ticker in itertools.product(*axes):
            yield Record(
                page_from=page_from,
                page_to=page_to,
                ui=ui,
                notif_type=notif_type,
                clong=clong,
                overlong=overlong,
                ticker=ticker
            )

def url_to_ui(r):
    """Classify a referer string into a UI kind.

    The string is split on "?"; the part before the first "?" decides the
    platform, and only the part between the first and second "?" is inspected
    for the "mobileapp" marker.

    Returns one of "touch", "app", "pad" or "desktop".
    """
    pieces = r.split("?")
    location = pieces[0]

    if "/touch" in location:
        # A touch page counts as the app when either the location or the
        # first query chunk says so.
        in_app = "searchapp" in location or (len(pieces) > 1 and "mobileapp" in pieces[1])
        return "app" if in_app else "touch"

    if "/pad" in location or "/tablet" in location:
        return "pad"

    return "desktop"

def page_to_service(p):
    """Map a "host||page" referer string to a short service name.

    The scheme prefix "https://", any "www." and the "m.yandex." mobile prefix
    are stripped first. Then:

    * if the host has more than one dot-separated label and the first label is
      not "yandex", that first label is the service (e.g. "zen.yandex.ru" ->
      "zen");
    * otherwise the page part decides: "/" is "morda", and any other page is
      named after its first path segment, with a few renames applied.

    Returns None for the "instant" segment, for any segment containing "tv",
    and for any input that cannot be parsed (missing "||", empty page,
    non-string input, ...).
    """
    # Exact-match renames for the first path segment on yandex.* hosts.
    renames = {
        "yandsearch": "search",
        "touchsearch": "search",
        "gorsel": "images",
        "themes": "morda",
        "ugcpub": "ugc",
        "local": "district",
        # global-notifications --> zen
        "global-notifications": "zen",
        "": "unknown",
        "d": "unknown",
    }
    try:
        normalized = p.replace("https://", "").replace("www.", "").replace("m.yandex.", "yandex.")
        host_and_page = normalized.split("||", 1)
        host_labels = host_and_page[0].split(".")
        if host_labels[0] != "yandex" and len(host_labels) > 1:
            return host_labels[0]

        page = host_and_page[1]
        if page == "/":
            return "morda"

        segment = page.split("/")[1]
        if segment == "instant" or "tv" in segment:
            return None
        return renames.get(segment, segment)
    except Exception:
        # Best-effort parser: malformed referers yield None. Unlike a bare
        # `except:`, this lets SystemExit/KeyboardInterrupt propagate.
        return None


@with_hints(
    # Two output schemas; make_job unpacks the mapper result as (log900, data),
    # matching the (only900, actions) emitter order of the function below.
    # NOTE(review): the Records emitted below carry fields that are not declared
    # here (`recordid` for only900; `notif_service`/`notif_type` for actions) —
    # confirm this is intended.
    output_schema=multischema(
        dict(uid=str, timestamp=int, referer=str, ui=str, pid=str, path=str, notif_type=str, notif_service=str),
        dict(uid=str, timestamp=int, referer=str, ui=str, path=str)
    )
)
def process_redir_data4(recs, only900, actions):
    """Split redir-log records into a pid=900 notification stream and an action stream.

    recs     -- input records; the code reads normal_path, yandexuid, pid,
                timestamp, parsed_vars, canonized_vhost, page,
                referer_canonized_vhost and referer_page.
    only900  -- emitter called once per notification found in a pid == "900" record.
    actions  -- emitter called once per record that is not skipped.

    Records whose referer contains "hamster", "priemka", "clck", "-test" or
    "staff" are skipped, as are pid=900 records whose "-notifications" payload
    cannot be parsed.
    """
    for rec in recs:

        path = rec.normal_path
        uid = rec.yandexuid
        notif_type, notif_service = None, None

        # Build the "host||page" referer: pid=900 records use their own page,
        # all other records use the referring page.
        try:
            if rec.pid == "900":
                referer = rec.canonized_vhost + "||" + rec.page
            else:
                referer = rec.referer_canonized_vhost + "||" + rec.referer_page
            # Drop testing/internal traffic entirely.
            if "hamster" in referer or "priemka" in referer or "clck" in referer or "-test" in referer or "staff" in referer:
                continue
        except:
            # Any failure while building the referer (e.g. a missing field)
            # falls back to an empty referer instead of dropping the record.
            referer = ""

        ui = url_to_ui(referer)

        if rec.pid == "900":
            vrsr = rec.parsed_vars
            if vrsr is not None:
                if "-notifications" in vrsr:
                    # "-notifications" holds a URL-quoted Python-literal
                    # sequence of dicts (urllib.unquote / str.decode are the
                    # Python 2 APIs).
                    vrs = vrsr["-notifications"]
                    vrs1 = urllib.unquote(vrs).decode('utf8')
                    try:
                        vrs2 = ast.literal_eval(vrs1)
                    except:
                        # Unparseable payload: skip the whole record, including
                        # the `actions` emit at the bottom of the loop.
                        continue

                    # One only900 record per notification in the payload.
                    for vrs_prs in vrs2:
                        notif_type, notif_service = "unknown", "unknown"
                        recordId = None
                        if "type" in vrs_prs:
                            # Gendered variants are folded into one type.
                            notif_type = vrs_prs["type"].replace("_male","").replace("_female","")
                        if "service" in vrs_prs:
                            notif_service = vrs_prs["service"]
                        if "recordId" in vrs_prs:
                            recordId = vrs_prs["recordId"]

                        #if notif_service == "tracker":
                        #    continue

                        #elif "global-notifications" in notif_service:
                        #    notif_service = "unknown"

                        only900( Record(uid=uid, timestamp=rec.timestamp, ui=ui, referer=referer, path=path,
                                pid=rec.pid, notif_type=notif_type, notif_service=notif_service, recordid=recordId) )
                    # After the loop notif_type/notif_service keep the values of
                    # the LAST notification and flow into the `actions` record below.

                else:
                    notif_type, notif_service = "unknown", "unknown"
                    recordid = None

                    # click
                    if "-service" in vrsr and "-type" in vrsr:
                        notif_service = urllib.unquote(vrsr["-service"]).decode('utf8')
                        notif_type = urllib.unquote(vrsr["-type"]).decode('utf8')
                        try:
                            recordid = urllib.unquote(vrsr["-record_id"]).decode('utf8')
                        except:
                            # "-record_id" is optional; keep recordid = None.
                            pass

                    # open
                    # NOTE(review): notif_service is taken from "-recordId" here — confirm this is intended.
                    elif "-settingId" in vrsr and "-recordId" in vrsr:
                        notif_service = vrsr["-recordId"]
                        notif_type = vrsr["-settingId"]

                    notif_type = notif_type.replace("_male","").replace("_female","")

                    #if notif_service == "tracker":
                    #    continue

                    only900( Record(uid=uid, timestamp=rec.timestamp, ui=ui, referer=referer, path=path,
                            pid=rec.pid, notif_type=notif_type, notif_service=notif_service, recordid=recordid) )

            else:
                # No parsed vars at all: still emit a pid=900 record with unknown
                # attributes. notif_type/notif_service stay None for `actions`.
                only900( Record(uid=uid, timestamp=rec.timestamp, ui=ui, referer=referer, path=path,
                            pid=rec.pid, notif_type='unknown', notif_service='unknown', recordid=None) )

        # Post-processing for the action stream. `ui` was already derived from
        # the original referer and is not recomputed after the overrides below.
        if path == "notifier.results.show":
            # Show events carry no notification attribution.
            notif_type, notif_service = None, None

        elif "player-events." in path or "videohub" in path or rec.pid=="197":
            # Force the referer so page_to_service maps these to "video".
            referer = "yandex.ru||/video/search"
        elif rec.pid=="40":
            # Same for pid 40 -> "images".
            referer = "yandex.ru||/images/search"

        actions( Record(uid=uid, timestamp=rec.timestamp, ui=ui, referer=referer, path=path,
                        notif_service=notif_service, notif_type=notif_type) )


@with_hints(
    files=[nfl.TableFile('$job_root/services', 'services')],
    output_schema=dict(
        uid=str,
        ui=str,
        path=str,
        notif_type=str,
        notif_service=str,
        service=str,
        timestamp=int,
        splitter=str
    )
)
def service_map(recs, **options):
    """Replicate every record once per known notification service ("splitter").

    The list of splitters is read from the attached 'services' table (field
    n_service); empty values and names containing any of the technical markers
    below are excluded. Records whose own `service` contains "hamster",
    "priemka" or "clck" are dropped.

    recs    -- input records; uid, notif_service, notif_type, path, service,
               timestamp and ui are read.
    options -- must contain 'file_streams' with a 'services' stream.

    Yields one Record per (input record, splitter) pair with the input fields
    copied and `splitter` set to the service name.
    """
    file_streams = options['file_streams']
    excluded_markers = ("notifications", "clck", "hamster", "priemka", "test", "unknown", "staff")
    splitters = [
        row.n_service
        for row in file_streams['services']
        if row.get('n_service')
        and not any(marker in row.n_service for marker in excluded_markers)
    ]

    for rec in recs:
        service = rec.service
        # The service check does not depend on the splitter, so it is done once
        # per record instead of once per (record, splitter) pair.
        if "hamster" in service or "priemka" in service or "clck" in service:# or "global-notifications" in service:
            continue
        for splitter in splitters:
            yield Record(uid=rec.uid, notif_service=rec.notif_service, notif_type=rec.notif_type, path=rec.path, service=service, timestamp=rec.timestamp, ui=rec.ui, splitter=splitter)


@with_hints(
    # NOTE(review): clong is declared as str here, but the reducer yields the
    # ints 0/1 and add_totals2 declares clong=int — confirm which is intended.
    output_schema=dict(
        uid=str,
        ui=str,
        notif_type=str,
        page_from=str,
        page_to=str,
        clong=str,
        overlong=int,
        ticker=int
    )
)
def myreduce_v3(recs):
    """Classify notifier clicks leading to one service by how long the user stayed away.

    recs yields (key, records) groups; key carries uid and splitter. make_job
    groups by (uid, splitter) and sorts by timestamp before calling this
    reducer, so records are expected in time order.

    Within a group the splitter `j` is the target service under analysis.
    Events that belong to that service itself and "mark read" events are
    ignored. For every remaining event that directly follows a
    "notifier.results.click" attributed to `j`, the gap between the click and
    that event is classified:

        gap > 30*60  -> nothing is emitted
        gap > 120    -> clong=1, overlong=1
        gap > 30     -> clong=1, overlong=0
        otherwise    -> clong=0, overlong=0

    A click that is the last kept event of the group is emitted with
    clong=0, overlong=0. `ticker` is 1 when a "ticker" event was seen in the
    group no more than 30*60 before the event being evaluated.
    """
    for key, records in recs:

        uid = key.uid
        j = key.splitter

        # State of the previously kept event. notif_type is deliberately (or
        # accidentally) not reset here; it is only read once prev_path is set,
        # i.e. after it has been assigned at the end of an iteration of this group.
        prev_path, prev_action, prev_ts, notif_service = None, None, None, None
        has_ticker, ticker_ts = 0, None

        for rec in records:
            currect_action = rec.service
            # Skip events on the target service itself (substring match in
            # either direction): the return is measured by the first event elsewhere.
            if j in currect_action or currect_action in j:
                continue
            current_path = rec.path

            # "Mark as read" interactions are ignored.
            if current_path in ["notifier.results.mark_read", "notifier.results.kebab.read"]:
                continue
            currect_ts = rec.timestamp

            if current_path == "ticker":
                has_ticker = 1
                ticker_ts = currect_ts

            if prev_path is not None:
                # Previous kept event was a notifier click attributed to service j.
                if prev_path in ["notifier.results.click"] and notif_service == j and notif_type is not None:
                    dw = currect_ts - prev_ts
                    # The ticker flag expires 30*60 after the last ticker event.
                    if ticker_ts is not None and (currect_ts - ticker_ts) > 30*60:
                        has_ticker = 0
                    if dw > 30*60:
                        # Gap above the session cap: the click is not counted.
                        pass
                    elif dw > 120:
                        yield Record(ui=rec.ui, uid=uid, notif_type=notif_type, page_from=prev_action, page_to=notif_service,
                            clong=1, overlong=1, ticker=has_ticker)
                    elif dw > 30:
                        yield Record(ui=rec.ui, uid=uid, notif_type=notif_type, page_from=prev_action, page_to=notif_service,
                            clong=1, overlong=0, ticker=has_ticker)
                    else:
                        yield Record(ui=rec.ui, uid=uid, notif_type=notif_type, page_from=prev_action, page_to=notif_service,
                            clong=0, overlong=0, ticker=has_ticker)

            # Remember this event as the "previous" one for the next iteration.
            prev_action = currect_action
            prev_ts = currect_ts
            prev_path = current_path
            notif_service = rec.notif_service
            notif_type = rec.notif_type

        # Trailing click with no later kept event: emitted as a short click.
        # `currect_ts` is the timestamp of the last kept event, while `rec` is
        # the last record iterated (which may be one that was skipped above).
        # NOTE(review): "/notifier/results/click" is accepted here but not in
        # the in-loop check — confirm whether that asymmetry is intended.
        if prev_path in ["/notifier/results/click", "notifier.results.click"] and notif_service == j and notif_type is not None:
            if ticker_ts is not None and (currect_ts - ticker_ts) > 30*60:
                has_ticker = 0
            yield Record(ui=rec.ui, uid=uid, notif_type=notif_type, page_from=prev_action, page_to=notif_service,
                    clong=0, overlong=0, ticker=has_ticker)


def parse_bao_events(norm_p, events):
    """Derive the effective path of a record from its `events` payload.

    events is expected to be a URL-quoted Python-literal sequence whose first
    element is a dict-like event description.

    Returns:
      * the first event's "event" value, with "_videohub" appended when it is
        "click" and the event's "parent-path" is "videohub";
      * None when the payload parses but its first element has no "event" key;
      * norm_p when the payload is missing or cannot be parsed.
    """
    try:
        first = ast.literal_eval(urllib.unquote(events))[0]
        if "event" not in first:
            return None
        event = first["event"]
        if event == "click" and "parent-path" in first and first["parent-path"] == "videohub":
            event = event + "_videohub"
        return event
    except Exception:
        # Best-effort: any parsing problem falls back to the original
        # normal_path. SystemExit/KeyboardInterrupt are no longer swallowed.
        return norm_p

def parse_ticker_events(events):
    """Tell whether an `events` payload describes a notifier ticker event.

    events is expected to be a URL-quoted Python-literal sequence. The result
    is True only when its first element has "type" equal to 'ticker' and its
    "data" contains "notifierTickerValue". Anything else, including a missing
    or unparseable payload, gives False.
    """
    try:
        first = ast.literal_eval(urllib.unquote(events))[0]
        return bool(
            "type" in first
            and first["type"] == 'ticker'
            and "notifierTickerValue" in first["data"]
        )
    except Exception:
        # Best-effort: malformed payloads are simply "not a ticker event".
        # SystemExit/KeyboardInterrupt are no longer swallowed.
        return False


# https://clubs.at.yandex-team.ru/yt/2642
@cli.statinfra_job

def make_job(job, nirvana, statface_client, options):
    """Build the daily notifier click-conversion job and publish it to Statface.

    job             -- nile job to extend; the configured job is returned.
    nirvana         -- provides directories[0], used as the $job_root template.
    statface_client -- client attached to the Statface report.
    options         -- provides `dates`, the list of date strings to process.

    For every date the redir logs are read, split by process_redir_data4,
    mapped to services, reduced by myreduce_v3 into per-click records,
    expanded with totals, aggregated, stored under $job_root and published.
    """

    job = job.env(
        yt_spec_defaults=dict(pool_trees=["physical"], tentative_pool_trees=["cloud"]),
        templates=dict(
            job_root=nirvana.directories[0]
        )
    )

    # Target Statface report (daily scale).
    report2new = ns.StatfaceReport() \
        .path('Notifier/Counters/NotifierClicksStatsV2Conversion_v2') \
        .scale('daily') \
        .client(statface_client)

    mydates = options.dates

    for strdate in mydates:

        # Collections redir log, parsed with qb2; records without yandexuid or
        # referer and "tech." paths are filtered out at read time.
        coll_data = job.table('//home/logfeller/logs/collections-redir-log/1d/' + strdate) \
            .qb2(log = 'redir-log',
                fields=['yandexuid', 'timestamp', 'normal_path', 'pid', 'url', 'parsed_vars',
                        'page', 'canonized_vhost', 'referer_page', 'referer_canonized_vhost',
                        se.log_field('events')],
                filters = [sf.default_filtering('redir-log'),
                        sf.defined('yandexuid', 'referer'), sf.not_(sf.contains('normal_path', 'tech.'))],
                mode='yamr_lines')

        # Answers (znatoki) redir log, projected onto the same field names.
        # NOTE(review): the other string values in this project() call
        # ("path", "url") look like source field names, so referer_page="/znatoki"
        # would read a field named "/znatoki" rather than set a constant —
        # confirm whether ne.const("/znatoki") was intended.
        answ_data = job.table('//home/logfeller/logs/answ-redir-log/1d/' + strdate) \
            .project("yandexuid", "timestamp", normal_path="path", pid=ne.const("znatoki"),
                page="url", canonized_vhost=ne.const(""),
                referer_page="/znatoki", referer_canonized_vhost=ne.const("yandex.ru")
            )

        data_raw_extra = coll_data.concat(answ_data)
        # NOTE(review): the main input is read from a //tmp table that is not
        # produced in this file — presumably prepared by an earlier step; confirm.
        data_raw = job.table('//tmp/rkam/data_qb2_'+strdate).concat(data_raw_extra)

        # The string literal below is a disabled earlier variant of the pipeline
        # (ticker events split out separately); it is a no-op expression.
        """
        standard, ticker = data_raw.split(sf.and_(sf.defined('events'), nf.custom(parse_ticker_events, 'events')))

        s1 = standard.filter(sf.defined('normal_path'))
        s2 = ticker.project(ne.all(exclude='events'), normal_path = ne.const("ticker"))

        log900, data = s1.concat(s2).map(process_redir_data4)
        """

        # Replace normal_path with the value derived from the events payload,
        # drop records without a path as well as tech/show events, then split
        # into the pid=900 stream (log900) and the action stream (data).
        # log900 is not used further in this function.
        log900, data = data_raw.project(ne.all(exclude='normal_path'),
                     normal_path_events=ne.custom(parse_bao_events, 'normal_path', 'events').add_hints(type=str)) \
            .project(ne.all(exclude='normal_path_events'), normal_path='normal_path_events') \
            .filter(sf.and_(sf.defined('normal_path'),
                sf.not_(sf.contains('normal_path', 'tech.')),
                sf.not_(sf.equals('normal_path', 'tech')),
                sf.not_(sf.equals('normal_path', 'show'))
                )
            ).map(process_redir_data4)

        # Per-date table joined by uid; its `ui` replaces the one computed from the referer.
        notif_uids = job.table('$job_root/notif_uids_' + strdate)

        p1 = data.project(ne.all(exclude='ui')).join(notif_uids, by='uid')
        p2 = p1.project(ne.all(), service=ne.custom(page_to_service, 'referer').add_hints(type=str))

        # Replicate each action per splitter service, then walk every
        # (uid, splitter) timeline in timestamp order.
        p4 = p2.filter(sf.defined('service')).map(service_map) \
            .groupby('uid', 'splitter').sort('timestamp') \
            .reduce(
                myreduce_v3,
                memory_limit=4000,
                intensity='data'
            )

        # Add '_total_' roll-ups, aggregate the counters per dimension
        # combination, store the result and publish it. `longclicks` itself is
        # not used afterwards.
        longclicks = p4.map(add_totals2) \
            .groupby('ui', 'page_from', 'page_to', 'notif_type') \
            .aggregate(hits=na.count(), hits_long=na.sum('clong'), hits_overlong=na.sum('overlong'), ticker=na.sum('ticker')) \
            .filter(sf.defined('page_from', 'page_to', 'notif_type')) \
            .project(ne.all(), fielddate=ne.const(strdate)) \
            .put("$job_root/user_actions_v2_" + strdate).publish(report2new, allow_change_job=True)

    return job


# When executed as a script, hand control to nile's cli.run().
if __name__ == '__main__':
    cli.run()

