#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://st.yandex-team.ru/FUNCTIONALITY-2004
# Calculate Fenerbahçe browser installation statistics.

import os
import sys
from functools import wraps

import yt.wrapper as yt
import logging

# Logger used by this script for its own progress messages.
_print = logging.getLogger("fenestats")

# Point the YT client at the cluster proxy this script works with.
yt.config["proxy"]["url"] = "plato.yt.yandex.net"
# When stderr is not a terminal, reduce log output: only errors from the YT
# operation tracker and the "Yt" logger, warnings and above from this script.
if not os.isatty(sys.stderr.fileno()):
    yt.config["operation_tracker"]["stderr_logging_level"] = "ERROR"
    logging.getLogger("Yt").setLevel(level=logging.ERROR)
    _print.setLevel(level=logging.WARN)


from nile.api.path import Date, DateRange
from nile import Record
from nile import clusters, aggregators as na, extractors as ne, filters as nf
import datetime
import time
import urlparse

# Shared DSV format object; the mappers below use it to parse the raw
# 'value' field of log rows into a dict of fields.
dsv = yt.format.DsvFormat()

# Banner ids accepted by the banerid() mapper even though they do not start
# with the '60' prefix.
test_banerids = set([
    "0101910664",
    "0101910627",
    "8186690025",
    "0101040674",
    "0201540200",
])

def to_iso_8601(timestamp):
    """Normalize a log timestamp to the form ``YYYY-MM-DDTHH:MM:SS``.

    Accepts either the access-log form ``DD/Mon/YYYY:HH:MM:SS`` or a string
    that is already in the ISO form. The ISO form is re-parsed rather than
    returned as is, so malformed input is still rejected.

    Raises ValueError if the string matches neither format.
    """
    iso_format = '%Y-%m-%dT%H:%M:%S'
    try:
        parsed = time.strptime(timestamp, '%d/%b/%Y:%H:%M:%S')
    except ValueError:
        # Only a format mismatch triggers the fallback; anything else
        # (KeyboardInterrupt, TypeError, ...) propagates untouched.
        parsed = time.strptime(timestamp, iso_format)
    return time.strftime(iso_format, parsed)

def parse_export_timestamp(ts):
    """Convert a log timestamp string to seconds since the epoch.

    The string is tried as ``DD/Mon/YYYY:HH:MM:SS`` first and as
    ``YYYY-MM-DDTHH:MM:SS`` second; the parsed time is passed to
    ``time.mktime``, i.e. interpreted as local time.

    Raises ValueError if the string matches neither format.
    """
    access_log_format = '%d/%b/%Y:%H:%M:%S'
    iso_format = '%Y-%m-%dT%H:%M:%S'
    try:
        parsed = time.strptime(ts, access_log_format)
    except ValueError:
        parsed = time.strptime(ts, iso_format)
    return time.mktime(parsed)

def to_nile_mapper(mapper):
    """Adapt a per-record mapper to a mapper over a stream of records.

    ``mapper`` takes one input record and returns an iterable of dicts.
    The returned function takes an iterable of records and yields one
    ``Record`` for every dict that ``mapper`` produces.
    """
    @wraps(mapper)
    def nj(records):
        for r in records:
            # Feed the single record to the mapper, not the whole stream.
            for d in mapper(r):
                yield Record(**d)
    return nj

def query_params(query_string):
    """Parse a URL query string into a flat ``{name: value}`` dict.

    When a parameter occurs more than once, only its first value is kept.
    """
    parsed = urlparse.parse_qs(query_string)
    flat = {}
    for name in parsed:
        flat[name] = parsed[name][0]
    return flat

def compactify_export(record):
    """Reduce one export-access-log record to the fields used downstream.

    Yields at most one dict with the keys ``ts``, ``banerid``, ``stat``,
    ``ui`` and ``yandexuid``. Nothing is yielded unless the request's query
    string has ``yasoft=yabrowser``, a non-empty ``ui`` (braces stripped),
    and either a ``banerid`` starting with '60' or a ``stat`` ending in
    'install' (which also matches 'uninstall').
    """
    query = record.get('request', '').partition('?')[2]
    params = query_params(query)

    is_browser = params.get('yasoft') == 'yabrowser'
    ui = params.get('ui', '').strip('{}')
    if not is_browser or not ui:
        return

    stat = params.get('stat', '')
    banerid = params.get('banerid', '')
    if not banerid.startswith('60') and not stat.endswith('install'):
        return

    yield {
        'ts': to_iso_8601(record['timestamp']),
        'banerid': banerid,
        'stat': stat,
        'ui': ui,
        'yandexuid': params.get('yandexuid'),
    }


def banerid(record):
    """Extract an installation event from one raw export-access-log row.

    ``record['value']`` is parsed as DSV. At most one dict with the keys
    ``banerid``, ``stat``, ``timestamp`` and ``ui`` is yielded, and only when
    the request's ``banerid`` starts with '60' or is one of ``test_banerids``
    and its ``stat`` is exactly 'install'.
    """
    row = dsv.loads_row(record['value'])
    query = row.get('request', '').partition('?')[2]
    # The timestamp is parsed before any filtering, so a row with a missing
    # or malformed timestamp raises even if it would have been skipped.
    timestamp = parse_export_timestamp(row['timestamp'])
    params = urlparse.parse_qs(query)

    banner = params.get('banerid', [''])[0]
    if not banner.startswith('60') and banner not in test_banerids:
        return

    stat = params.get('stat', [''])[0]
    if banner and stat == 'install':
        yield {
            'banerid': banner,
            'stat': stat,
            'timestamp': timestamp,
            'ui': params.get('ui', [''])[0].strip('{}'),
        }

def run_banerid(table):
    """Apply the ``banerid`` mapper to ``table`` and return the mapped table."""
    mapped = table.map(banerid)
    return mapped

def barnavig(records):
    """Map raw bar-navig rows to ``Record(ui, timestamp, ...)``.

    Each input record's ``value`` is parsed as DSV. Rows whose
    ``http_params`` carry no ``ui`` are skipped. ``timestamp`` is the row's
    ``unixtime``; ``yandexuid``, ``fyandexuid`` and ``ip`` are copied onto
    the record only when the row has a non-empty value for them.
    """
    optional_fields = ('yandexuid', 'fyandexuid', 'ip')
    for record in records:
        row = dsv.loads_row(record['value'])
        http_params = urlparse.parse_qs(row.get('http_params', ''))
        ui = http_params.get('ui', [None])[0]
        if not ui:
            continue
        out = Record(ui=ui, timestamp=row.get('unixtime'))
        for name in optional_fields:
            value = row.get(name)
            if value:
                setattr(out, name, value)
        yield out

def uniq_yandexuids(table):
    """Run ``barnavig`` over ``table``, deduplicate and sort by ``ui``.

    Rows are made unique on (ui, yandexuid, fyandexuid).
    """
    extracted = table.map(barnavig)
    deduplicated = extracted.unique('ui', 'yandexuid', 'fyandexuid')
    return deduplicated.sort('ui')

def count_searches(sessions):
    """Count, per ``key``, the session rows that look like requests.

    Keeps rows whose ``key`` starts with 'y' and whose ``value`` starts with
    'type=REQUEST', then groups by ``key`` and stores the row count as
    ``hits``.
    """
    key_filter = nf.custom(lambda key: key.startswith('y'), 'key')
    value_filter = nf.custom(
        lambda value: value.startswith('type=REQUEST'), 'value')
    requests = sessions.filter(key_filter, value_filter)
    return requests.groupby('key').aggregate(hits=na.count())

def count_fb_searches(counts, yandexuids):
    """Sum search hits per ``ui`` for the yandexuids listed in a table.

    ``counts`` is a table with ``key`` and ``hits`` columns; ``yandexuids``
    is the path of a table with ``ui``, ``banerid`` and ``yandexuid``.
    Each yandexuid is prefixed with 'y' to match ``counts.key``; the two are
    inner-joined and ``hits`` is summed per ``ui``.
    """
    job = counts.job
    source = job.table(yandexuids)
    prefixed_uid = ne.custom(lambda uid: 'y' + uid, 'yandexuid')
    yuids = source.project('ui', 'banerid', yu=prefixed_uid)
    joined = yuids.join(
        counts, by_left='yu', by_right='key', type='inner',
        assume_unique_left=True)
    return joined.groupby('ui').aggregate(hits=na.sum('hits'))

def ui_to_yu(key, records):
    """Reducer: pair every yandexuid seen for a ``ui`` with its banerids.

    A record contributes its ``yandexuid`` if it has that key, otherwise its
    ``banerid`` if it has that one. When at least one banerid was seen, one
    dict ``{'ui', 'banerid', 'yandexuid'}`` is yielded per distinct
    yandexuid; ``banerid`` is the comma-joined list of distinct banerids.
    Nothing is yielded when no banerid was seen.
    """
    yus = set()
    banerids = set()
    for r in records:
        if 'yandexuid' in r:
            yus.add(r['yandexuid'])
        elif 'banerid' in r:
            banerids.add(r['banerid'])
    if not banerids:
        return
    # Sort so the joined value does not depend on set iteration order;
    # otherwise the same banerids could be emitted as different strings.
    joined_banerids = ','.join(sorted(banerids))
    for yu in yus:
        yield {'ui': key['ui'], 'banerid': joined_banerids, 'yandexuid': yu}

def daily_since(date):
    """Return a ``DateRange`` from ``Date(date)`` to today's ``Date``."""
    first = Date(date)
    last = Date(datetime.date.today())
    return DateRange(first, last)

def days_ago(n):
    """Return the ``Date`` that lies ``n`` days before today."""
    offset = datetime.timedelta(days=n)
    return Date(datetime.date.today() - offset)

def nonempty(table):
    """Tell whether the YT node ``table`` exists and has a positive row count."""
    exists = yt.exists(table)
    if not exists:
        return exists
    return yt.get_attribute(table, 'row_count') > 0

def remake_map(operation, src, dst, sort_by=None, nile_job=None, job_args=None,
        remove_source=False):
    """Build table ``dst`` from ``src`` unless ``dst`` already exists.

    Nothing is computed when ``src`` is missing or empty, or when ``dst`` is
    already there. Otherwise, inside one YT transaction:

    * with ``nile_job``: ``operation(nile_job.table(src), *job_args)`` is
      stored to ``dst`` and the job is run;
    * without it: ``operation`` is run as a plain YT map from ``src`` to
      ``dst``;
    * ``dst`` is then sorted by ``sort_by`` if given;
    * ``src`` is removed if ``remove_source`` is true.

    ``sort_by`` and ``job_args`` default to ``None`` (meaning "empty")
    instead of a shared mutable list.

    Returns ``[dst]`` if ``dst`` exists afterwards, else ``[]``.
    """
    _print.info('remake_map: %s -> %s', src, dst)
    if nonempty(src) and not yt.exists(dst):
        with yt.Transaction():
            if nile_job:
                operation(nile_job.table(src), *(job_args or ())).put(dst)
                nile_job.run()
            else:
                yt.run_map(operation, src, dst)
            if sort_by:
                yt.run_sort(dst, sort_by=sort_by)
            if remove_source:
                yt.remove(src)
    if yt.exists(dst):
        return [dst]
    return []

def with_date(pattern, date):
    """Fill the ``{date}`` placeholder of ``pattern`` with ``date``."""
    substitutions = {'date': date}
    return pattern.format(**substitutions)

def accumulate(operation, in_pattern, out_pattern, *args):
    all_cumulative = sorted(yt.list(out_pattern.format(date='').rstrip('/')))
    all_daily = sorted(yt.list(in_pattern.format(date='').rstrip('/')))
    latest_daily = all_daily[-1]
    if all_cumulative:
        latest_cumulative = all_cumulative[-1]
        new_daily = [date for date in all_daily
                if date > latest_cumulative]
        if not new_daily:
            return out_pattern.format(date=latest_cumulative)
        sources = [with_date(out_pattern, latest_cumulative)] + [
                with_date(in_pattern, n) for n in new_daily]
    else:
        sources = [with_date(in_pattern, d) for d in all_daily]
    print sources, '-> cumulative', latest_daily
    operation(sources, out_pattern.format(date=latest_daily), *args)
    return out_pattern.format(date=latest_daily)

def banerid_freshest(input_paths, output_path, job):
    """Concatenate the input tables and keep the top-1 row per ``ui``.

    Rows are ranked by ``timestamp``; the result is stored to
    ``output_path`` and ``job`` is run.
    """
    tables = []
    for path in input_paths:
        tables.append(job.table(path))
    merged = job.concat(*tables)
    merged.groupby('ui').top(1, 'timestamp').put(output_path)
    job.run()

def aggregate_5min_op(key, records):
    """Reducer: fold the events of one ``ui`` into a single state row.

    A record with ``ts == '0'`` is taken as a previously saved state and
    replaces the current one. For every other record:

    * its ``yandexuid`` (if non-empty) is added to the state;
    * ``stat == 'install'`` stores ``banerid`` and ``installation_ts``;
    * ``stat == 'uninstall'`` resets the state to a fresh, empty one.

    Yields exactly one dict with ``ts == '0'``, the ``ui`` from ``key``,
    the sorted distinct ``yandexuids`` and, if an installation is current,
    ``banerid`` and ``installation_ts``.
    """
    def fresh_state():
        # Build a new dict AND a new list each time. A shallow copy of a
        # template would share the list, so yandexuids collected before an
        # uninstall would leak into the state after it.
        return {'ts': '0', 'yandexuids': []}

    state = fresh_state()
    for r in records:
        if r['ts'] == '0':
            state = r
            continue
        yu = r.get('yandexuid')
        if yu:
            state['yandexuids'].append(yu)
        if r['stat'] == 'install':
            state['banerid'] = r['banerid']
            state['installation_ts'] = r['ts']
        elif r['stat'] == 'uninstall':
            state = fresh_state()
    state['yandexuids'] = sorted(set(state['yandexuids']))
    # The output is sorted by and later re-reduced by 'ui', so the state row
    # must carry its key.
    state['ui'] = key['ui']
    yield state


def aggregate_5min(input_paths, output_path):
    """Reduce the input tables by ``ui`` and sort the result by (ui, ts).

    ``aggregate_5min_op`` is the reducer.
    """
    reduce_key = ['ui']
    sort_key = ['ui', 'ts']
    yt.run_reduce(aggregate_5min_op, input_paths, output_path,
                  reduce_by=reduce_key)
    yt.run_sort(output_path, sort_by=sort_key)

def yuid_to_banerid_freshest(yuid_paths, output_path, job, banerid):
    """Attach banerids to yandexuids via their shared ``ui``.

    The table at path ``banerid`` is inner-joined on ``ui`` with the
    concatenation of the tables in ``yuid_paths``. The result is projected to
    (banerid, yandexuid, ui), made unique on (banerid, yandexuid), stored to
    ``output_path``, and ``job`` is run.
    """
    yuid_tables = [job.table(path) for path in yuid_paths]
    all_yuids = job.concat(*yuid_tables)
    joined = job.table(banerid).join(
        all_yuids, by='ui', type='inner', assume_unique_left=True)
    pairs = joined.project('banerid', 'yandexuid', 'ui')
    pairs.unique('banerid', 'yandexuid').put(output_path)
    job.run()

def activity_by_hits(hits):
    """Map a hit count to an activity level: 0 below 15, 1 for 15..49, 2 from 50."""
    if hits >= 50:
        return 2
    if hits >= 15:
        return 1
    return 0


def browser_stats(table, last7days, yandexuids):
    """Compute per-(ui, banerid) search activity over a range of days.

    ``table`` is used only to obtain its job. The fb-searches tables selected
    by the ``last7days`` path suffix are inner-joined on ``ui`` with the
    table at path ``yandexuids``. Rows are grouped by (ui, banerid), counting
    rows as ``days`` and summing ``hits``; groups with fewer than 3 ``days``
    are dropped. The result has the columns banerid, hits, ui and
    ``activity`` (see ``activity_by_hits``).

    ``activity_by_hits`` must be resolvable at module level: this function
    previously referenced a name that existed only as a local of another
    function, which raised NameError here.
    """
    j = table.job
    searches = j.table(
        '//home/search-functionality/switch-tr/fb-searches/' + last7days)
    return (searches
            .project('ui', 'hits')
            .join(j.table(yandexuids), by='ui', type='inner', assume_unique=True)
            .groupby('ui', 'banerid').aggregate(days=na.count(), hits=na.sum('hits'))
            .filter(nf.custom(lambda x: x >= 3, 'days'))
            .project('banerid', 'hits', 'ui', activity=ne.custom(
                activity_by_hits, 'hits')))

def results(table):
    """Summarise activity per ``banerid``.

    Groups by ``banerid``, builds a histogram of ``activity`` values and sums
    them into ``score``. The result has the columns banerid, score,
    ``active_count`` (histogram count for activity 1) and ``super_count``
    (histogram count for activity 2); a missing histogram entry counts as 0.
    """
    per_banner = table.groupby('banerid').aggregate(
        activity=na.histogram('activity', sorted=True),
        score=na.sum('activity'))
    active_count = ne.custom(lambda hist: dict(hist).get(1, 0), 'activity')
    super_count = ne.custom(lambda hist: dict(hist).get(2, 0), 'activity')
    return per_banner.project(
        'banerid', 'score',
        active_count=active_count,
        super_count=super_count)


def recalculate():
    """Run one of three pipelines, selected by the first command-line argument.

    * ``5min``: compact every export-access-log table except the last one
      into ``switch-tr/5min/<name>`` and remove the source table.
    * ``aggr5min``: fold the ``5min`` tables into ``switch-tr/5min-cum``.
    * anything else (or no argument): the daily pipeline - per-day banerid
      and yandexuid tables, their cumulative versions, per-day search
      counts, and finally browser-stats and results for yesterday.

    Every step goes through ``remake_map`` / ``accumulate``, which skip work
    whose output already exists.
    """
    import sys
    # Mode 1: compact raw 5-minute export logs.
    if len(sys.argv) > 1 and sys.argv[1] == '5min':
        yt.mkdir('//home/search-functionality/switch-tr/5min', recursive=True)
        exp_logs = yt.list('//home/search-functionality/statbox/cs-http/export-access-log')
        exp_logs.sort()
        if exp_logs:
            exp_logs.pop() # the last table is usually locked by the dumper
        for tbl in exp_logs:
            res_path = '//home/search-functionality/switch-tr/5min/' + tbl
            # remove_source=True: the source log table is deleted once its
            # compact copy has been written.
            remake_map(compactify_export,
                '//home/search-functionality/statbox/cs-http/export-access-log/' + tbl,
                res_path,
                sort_by=['ui', 'ts'], remove_source=True)
        return

    # Mode 2: accumulate the compacted 5-minute tables into a per-ui state.
    if len(sys.argv) > 1 and sys.argv[1] == 'aggr5min':
        yt.mkdir('//home/search-functionality/switch-tr/5min-cum', recursive=True)
        print 'aggr5min'
        accumulate(aggregate_5min,
                '//home/search-functionality/switch-tr/5min/{date}',
                '//home/search-functionality/switch-tr/5min-cum/{date}')
        return

    # Default mode: the daily pipeline.
    # NOTE(review): banerid_tables and yuid_tables are filled below but not
    # read again in this function.
    banerid_tables = []
    yuid_tables = []
    # Make sure the output directories exist.
    yt.mkdir('//home/search-functionality/switch-tr/banerid', recursive=True)
    yt.mkdir('//home/search-functionality/switch-tr/banerid-cumulative', recursive=True)
    yt.mkdir('//home/search-functionality/switch-tr/yuids', recursive=True)
    yt.mkdir('//home/search-functionality/switch-tr/yuids-cumulative', recursive=True)
    yt.mkdir('//home/search-functionality/switch-tr/browser-stats', recursive=True)
    yt.mkdir('//home/search-functionality/switch-tr/results', recursive=True)
    cluster = clusters.Plato()
    # Per-day extraction for the dates from 7 days ago up to today:
    # installations (banerid) and ui -> yandexuid mappings (yuids).
    for date in daily_since(days_ago(7)):
        banerid_tables += remake_map(banerid,
            '//statbox/export-access-log/{date}'.format(date=date),
            '//home/search-functionality/switch-tr/banerid/{date}'.format(date=date),
            sort_by=['ui'])
        yuid_tables += remake_map(uniq_yandexuids,
            '//statbox/bar-navig-decoded-bundle-log/{date}'.format(date=date),
            '//home/search-functionality/switch-tr/yuids/{date}'.format(date=date),
            sort_by=['ui'], nile_job=cluster.job())
    # Fold the daily tables into cumulative ones; each call returns the path
    # of the resulting cumulative table.
    banerids = accumulate(banerid_freshest,
            '//home/search-functionality/switch-tr/banerid/{date}',
            '//home/search-functionality/switch-tr/banerid-cumulative/{date}',
            cluster.job())
    yandexuids = accumulate(yuid_to_banerid_freshest,
            '//home/search-functionality/switch-tr/yuids/{date}',
            '//home/search-functionality/switch-tr/yuids-cumulative/{date}',
            cluster.job(), banerids)
    # Per-day search counts for the dates from 3 days ago up to today, then
    # restricted to the known yandexuids (fb-searches).
    for date in daily_since(days_ago(3)):
        searches = remake_map(count_searches,
            '//userdata/user_sessions/{date}'.format(date=date),
            '//home/search-functionality/switch-tr/searches/{date}'.format(date=date),
            nile_job=cluster.job())
        if searches:
            remake_map(count_fb_searches, searches[0],
            with_date('//home/search-functionality/switch-tr/fb-searches/{date}', date),
            nile_job=cluster.job(), job_args=[yandexuids])

    yesterday = days_ago(1)
    # NOTE(review): despite its name, this range starts 3 days ago, not 7.
    last7days = '{%s..%s}' % (days_ago(3), yesterday)
    # NOTE(review): `j` is not used again in this function.
    j = cluster.job()

    # NOTE(review): this function is local to recalculate() and is not
    # referenced inside it. browser_stats() looks up `activity_by_hits` as a
    # module-level name, which this local definition does not provide.
    def activity_by_hits(hits):
        if 15 <= hits < 50:
            return 1
        if hits >= 50:
            return 2
        return 0

    home = '//home/search-functionality/switch-tr/'

    # Final aggregation for yesterday: per-browser stats, then per-banerid
    # results.
    remake_map(browser_stats,
        with_date(home + 'fb-searches/{date}', yesterday),
        with_date(home + 'browser-stats/{date}', yesterday),
        nile_job=cluster.job(), job_args=[last7days, yandexuids])

    remake_map(results,
        with_date(home + 'browser-stats/{date}', yesterday),
        with_date(home + 'results/{date}', yesterday),
        nile_job=cluster.job())


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    recalculate()
