#-*- coding: UTF-8 -*-
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import urlparse
import nile
from datetime import datetime

def parse_us(groups):
    """Reduce user sessions into candidate-URL records for tracked serials.

    Each ``(key, records)`` pair from *groups* is parsed with
    ``libra.ParseSession`` using ``./blockstat.dict``; sessions that fail to
    parse are skipped.  Every Yandex video / web search request of the
    session is kept when it is tied to a tracked serial by at least one of:

    * an entity id found in the search props (``object_ids`` / ``kp_ids``);
    * the ``vsertitle`` relev value being one of ``serials``;
    * the lower-cased query containing a serial title and no stop word.

    For every main result of a kept request that survives the URL / host
    screening, one ``Record`` is yielded:

    * video requests (``TVideoResult`` results): ``tvt`` is the larger of
      ``min(PlayingDuration, Duration)`` and the heartbeat ``Ticks``;
      ``clicks`` and ``clicks120`` are 0;
    * web requests (``TWebResult`` results, hosts containing ``yandex``
      skipped): ``clicks`` counts the block's clicks, ``clicks120`` those
      with ``DwellTimeOnService > 120``; ``tvt`` is 0 and ``kp_serial`` is
      ``'-'``.

    ``black`` is 1 when the host contains a ``black_hosts`` entry, the URL
    contains ``'misticheskie-istorii'`` or (video only) ``result.Duration``
    is at least ``60 * 12``; otherwise 0.

    :param groups: iterable of ``(key, records)`` pairs; ``key.key`` is
        emitted as ``uid``.
    :return: generator of ``Record`` objects.
    """
    import libra

    object_ids = {'kin01044487': 'домашний арест',
                  'kin0893836': 'мистические истории',
                  'kin01115041': 'вне игры',
                  }
    # NOTE(review): this literal used to list 'kin01044487' twice, which
    # silently dropped the 'домашний арест' entry.  The second key is assumed
    # to be 'kin0893836', mirroring object_ids -- TODO confirm.
    kp_ids = {'kin01044487': 'домашний арест',
              'kin0893836': 'мистические истории',
              'kin01115041': 'вне игры',
              }

    serials = ['домашний арест', 'мистические истории', 'осторожно земляне', 'вне игры']
    stop_words = ['квн', 'tanks', 'танки', 'marvel', 'женщины вне игры']

    # Hosts whose results are never emitted.
    hosts = ['kinopoisk.ru',
             'russia.tv', 'newstube.ru', 'tvrain.ru',
             'megogo.net', 'matchtv.ru', '1tv.ru', 'ren.tv',
             ]

    # Host fragments that mark a result with black=1 (it is still emitted).
    black_hosts = ['cinema.club', 'kinoflux.org', 'serials.one', 'hdrezka',
                   'serialtv3.ru', 'hd-1080.com',
                   ]

    # URL fragments whose results are never emitted.
    urls = ['frontend.vh', 'wikipedia.org']
    # Bare site roots (scheme, slashes and 'www.' stripped) that are skipped.
    channel_urls = ['tnt-premier.ru', 'tnt-online.ru', 'tv3.ru', '2x2tv.ru', 'ctc.ru', 'start.ru']

    # Checked top to bottom; the first matching request type wins.
    ui_by_request_type = (
        (('TYandexVideoRequest', 'TYandexRelatedVideoRequest'), 'desktop video'),
        (('TTouchYandexVideoRequest', 'TTouchYandexRelatedVideoRequest'), 'touch video'),
        (('TPadYandexVideoRequest', 'TPadYandexRelatedVideoRequest'), 'pad video'),
        (('TMobileAppYandexVideoRequest', 'TMobileAppYandexRelatedVideoRequest'), 'app video'),
        (('TVideoRequestProperties',), 'other video'),
        (('TYandexWebRequest',), 'desktop web'),
        (('TTouchYandexWebRequest',), 'touch web'),
        (('TPadYandexWebRequest',), 'pad web'),
        (('TMobileAppYandexWebRequest',), 'app web'),
    )

    def detect_ui(request):
        """Return the ui label for *request*, or None for other request types."""
        for type_names, label in ui_by_request_type:
            if any(request.IsA(name) for name in type_names):
                return label
        return None

    def screen_url(url):
        """Return ``(host, black)`` for *url*, or None when it must be skipped."""
        bare = url.replace('https://', '').replace('http://', '').replace('/', '').replace('www.', '')
        if bare in channel_urls:
            return None
        if any(fragment in url for fragment in urls):
            return None

        try:
            host = urlparse.urlparse(url).netloc
        except Exception:
            # Best effort: fall back to the raw URL when it cannot be parsed,
            # without swallowing KeyboardInterrupt / SystemExit.
            host = url

        if host.replace('www.', '') in hosts:
            return None

        black = 0
        if any(fragment in host for fragment in black_hosts) or 'misticheskie-istorii' in url:
            black = 1
        return host, black

    for key, recs in groups:
        uid = key.key

        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Unparseable sessions are deliberately skipped.
            continue

        for r in session:
            ui = detect_ui(r)
            if ui is None:
                continue

            # NOTE(review): on a Python 2 byte string lower() presumably folds
            # only ASCII, so Cyrillic titles match only if the query is
            # already lower-case -- confirm whether Query is normalized.
            q = str(r.Query).lower()

            spv = r.SearchPropsValues
            relev = r.RelevValues

            sertitle = str(relev.get('vsertitle'))
            title_hit = sertitle in serials
            # A stop word anywhere in the query cancels a title match.
            query_hit = (any(ser in q for ser in serials)
                         and not any(st in q for st in stop_words))

            if 'video' in ui:
                obj_serial = object_ids.get(str(spv.get('UPPER.VideoExtraItems.object_id')))
                kp_serial = kp_ids.get(str(spv.get('REPORT.winner_ids')))

                if not (obj_serial or kp_serial or title_hit or query_hit):
                    continue

                for bl in r.GetMainBlocks():
                    result = bl.GetMainResult()
                    if not result.IsA("TVideoResult"):
                        continue

                    url = str(result.Url)
                    screened = screen_url(url)
                    if screened is None:
                        continue
                    host, black = screened

                    duration = r.FindVideoDurationInfo(result)
                    dt = min(duration.PlayingDuration, duration.Duration) if duration else 0

                    heartbeat = r.FindVideoHeartbeat(result, 'ANY')
                    ht = heartbeat.Ticks if heartbeat else 0

                    # Long videos are flagged (presumably seconds, i.e. 12 min
                    # and longer -- TODO confirm the unit).
                    if result.Duration >= 60 * 12:
                        black = 1

                    yield Record(ui=ui, url=url, host=host, tvt=max(dt, ht), black=black,
                                 clicks=0, clicks120=0, q=q, obj_serial=obj_serial,
                                 kp_serial=kp_serial, uid=uid, sertitle=sertitle)

            else:
                obj_serial = object_ids.get(str(spv.get('UPPER.EntitySearch.Ontoid')))

                if not (obj_serial or title_hit or query_hit):
                    continue

                for bl in r.GetMainBlocks():
                    result = bl.GetMainResult()
                    if not result.IsA('TWebResult'):
                        continue

                    url = str(result.Url)
                    screened = screen_url(url)
                    if screened is None:
                        continue
                    host, black = screened

                    if 'yandex' in host:
                        continue

                    clicks = 0
                    clicks120 = 0
                    for cl in bl.GetClicks():
                        clicks += 1
                        if cl.DwellTimeOnService > 120:
                            clicks120 += 1

                    yield Record(ui=ui, url=url, host=host, tvt=0, clicks=clicks,
                                 clicks120=clicks120, black=black, q=q,
                                 obj_serial=obj_serial, uid=uid, sertitle=sertitle,
                                 kp_serial='-')

def main():
    """Rebuild the candidate-URL tables from not yet processed fast sessions.

    Reads the ``_last_calculated_ts`` attribute of ``all_urls_fast`` (default
    0), then walks the last ten entries listed under the fast user-sessions
    prefix in sorted order.  For every entry whose numeric name is greater
    than that value and which has a ``clean`` table, one job is built and
    run:

    * ``parse_us`` reduces the sessions into ``$job_root/all_urls_fast``;
    * per (host, url) aggregates are derived for watched URLs, URLs with many
      long clicks, black-listed URLs and URLs on bad-player hosts;
    * these are merged with the Yang candidate tables and the existing
      ``$job_root/candidates`` table, re-aggregated and written back;
    * a copy left-joined with assessor marks, keeping rows with no
      ``result`` or with ``result == "ban"``, goes to
      ``$job_root/candidates_filtered``.

    After each successful run the attribute is set to the processed entry.
    """
    USER_SESSIONS_PREFIX = 'user_sessions/pub/search/fast'
    CANDIDATES_PREFIX = '//home/search-research/ensuetina/TNT_URLS'
    BAD_PLAYER_ID_HOSTS = '//home/search-research/ensuetina/plid_aggr_bad'
    YANG_CANDIDATES = '//home/search-research/ensuetina/TNT_URLS/yang_candidates'
    VIDEO_CANDIDATES_BY_DURATION = '//home/search-research/ensuetina/TNT_URLS/yang_candidates_video'
    ASSESSORS_MARKS = '//home/search-research/ensuetina/TNT_URLS/assesors_marks'

    progress_table = CANDIDATES_PREFIX + '/all_urls_fast'

    def per_url_totals(stream, **extra):
        """Group *stream* by (host, url) and compute the shared aggregates."""
        return stream.groupby('host', 'url').aggregate(
            shows=na.count(),
            tvt=na.sum('tvt'),
            clicks=na.sum('clicks'),
            clicks120=na.sum('clicks120'),
            sertitle=na.any('sertitle'),
            obj_serial=na.any('obj_serial'),
            q=na.any('q'),
            **extra
        )

    def with_neg(stream, field):
        """Add a ``neg`` column holding the negated *field* (descending sort key)."""
        return stream.project(ne.all(), neg=ne.custom(lambda value: -1 * value, field))

    env_options = dict(
        templates=dict(job_root=CANDIDATES_PREFIX),
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"]
        )
    )
    cluster = clusters.yt.Hahn().env(**env_options)

    last_calculated_ts = cluster.driver.client.get_attribute(progress_table, '_last_calculated_ts', 0)
    print(last_calculated_ts)

    recent_entries = sorted(cluster.driver.list(USER_SESSIONS_PREFIX))[-10:]
    for ts in recent_entries:
        if int(ts) <= int(last_calculated_ts):
            continue
        clean_path = USER_SESSIONS_PREFIX + '/' + ts + '/clean'
        if not cluster.driver.exists(clean_path):
            continue

        job = cluster.job()
        us = job.table(clean_path)

        reducer_files = [
            nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
            nile.files.RemoteFile('statbox/resources/libra.so'),
        ]
        urls = us.groupby('key').sort('subkey').reduce(
            parse_us, files=reducer_files, memory_limit=4000
        ).put('$job_root/all_urls_fast')

        # URLs that were actually watched; only those with a mean watch time
        # above 600 go on, but the full table is stored first.
        watched = per_url_totals(
            urls.filter(sf.custom(lambda tvt: tvt > 0, 'tvt')),
            tvt_per_Watch=na.mean('tvt')
        )
        watched = with_neg(watched, 'tvt').sort('neg').put('$job_root/aggr_urls_with_Tvt')
        t1 = watched.filter(sf.custom(lambda mean_tvt: mean_tvt > 600, 'tvt_per_Watch'))

        # Never watched, but with more than five long clicks in a record.
        long_clicked = urls.filter(
            sf.custom(lambda tvt, long_clicks: tvt == 0 and long_clicks > 5, 'tvt', 'clicks120')
        )
        t2 = with_neg(per_url_totals(long_clicked), 'clicks120') \
            .sort('neg').put('$job_root/aggr_urls_with_clicks120')

        # Neither watched nor long-clicked, but flagged as black.
        flagged = urls.filter(
            sf.custom(lambda tvt, long_clicks, black: tvt == 0 and long_clicks <= 5 and black == 1,
                      'tvt', 'clicks120', 'black')
        )
        t3 = with_neg(per_url_totals(flagged), 'tvt')

        # Everything seen on hosts listed in the bad-player table.
        on_bad_hosts = urls.join(job.table(BAD_PLAYER_ID_HOSTS), by='host', type='inner')
        t4 = with_neg(per_url_totals(on_bad_hosts), 'tvt')

        # Externally supplied candidates, padded with constant columns.
        external = job.concat(job.table(YANG_CANDIDATES), job.table(VIDEO_CANDIDATES_BY_DURATION))
        t5 = external.project(ne.all(),
                              shows=ne.const(0), tvt=ne.const(0),
                              clicks=ne.const(0), sertitle=ne.const("yang"),
                              obj_serial=ne.const("no"), q=ne.const("no"), neg=ne.const(0))

        candidates = job.table('$job_root/candidates')
        merged = job.concat(t1, t2, t3, t4, t5, candidates).groupby('url', 'host').aggregate(
            shows=na.sum('shows'),
            tvt=na.sum('tvt'),
            clicks=na.sum('clicks'),
            clicks120=na.sum('clicks120'),
            sertitle=na.any('sertitle'),
            obj_serial=na.any('obj_serial'),
            q=na.any('q'),
            neg=na.sum('neg')
        ).sort('neg').put('$job_root/candidates')

        marked = merged.join(job.table(ASSESSORS_MARKS), type='left', by='url')
        # Keep rows without an assessor result and rows marked "ban".
        unmarked_or_banned = nf.or_(
            nf.not_(sf.defined("result")),
            nf.and_(sf.defined("result"), sf.equals("result", "ban"))
        )
        marked.filter(unmarked_or_banned).put('$job_root/candidates_filtered')

        job.run()
        cluster.driver.client.set_attribute(progress_table, '_last_calculated_ts', ts)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
