#!/usr/bin/env python
# coding: utf-8


from nile.api.v1 import clusters, Record, aggregators as na, filters as nf
from qb2.api.v1 import QB2, filters as sf
import jul
import datetime
from nile.api.v1.statface.client import StatfaceProductionClient, StatfaceBetaClient
from nile.api.v1.statface.report import StatfaceReport

CLUSTER_NAME = 'hahn'
# Earlier locations, kept for reference:
# root_uuid = '//home/search-research/24julia/2035/uuids'
# ROOT = '//home/search-research/24julia/2035'
ROOT = '//home/suggest-dev/galamaj/2035'
# Look-back in days: on every run the logs shifted back by this many days are
# read again and the dates starting that many days before start_date are
# re-posted to the report.
TIME_TO_REPEAT = 4

# cluster = clusters.Hahn(pool='search-research_24julia').env(templates=dict(job_root=ROOT))
cluster = clusters.Hahn(pool='robot-suggest-dev').env(templates=dict(job_root=ROOT))

# The state file holds the last processed day (it is rewritten at the end of
# the script); processing resumes from the following day.
with open('2035_prev_date.txt', 'r') as prev_date_file:
    # strip() tolerates a trailing newline, e.g. after a manual edit.
    prev_date_str = prev_date_file.read().strip()
start_date = datetime.datetime.strptime(prev_date_str, '%Y-%m-%d').date() \
             + datetime.timedelta(1)
start_date_str = start_date.strftime('%Y-%m-%d')
start_date_str_rep = (start_date - datetime.timedelta(TIME_TO_REPEAT)).strftime('%Y-%m-%d')
end_date_str = jul.last_yt_log_day(CLUSTER_NAME, 'cs')
end_date = datetime.datetime.strptime(end_date_str, '%Y-%m-%d').date()
# ISO-formatted dates compare correctly as plain strings.
if start_date_str > end_date_str:
    bad_dates_message = 'Bad dates: start date %s > end date %s' % (start_date_str, end_date_str)
    print(bad_dates_message)
    # Abort explicitly: there is nothing new to process yet.
    raise ValueError(bad_dates_message)
print('%s %s' % (start_date_str, end_date_str))

# Read yesterday's snapshot of the distribution dictionary and index it by
# clid. Each row is tab-separated; column 1 is used as the clid and column 4
# as its type code. Judging by the variable names, '1002' marks search-bar
# clids and '1003' search-widget clids -- TODO confirm against the dictionary.
yesterday = datetime.date.today() - datetime.timedelta(1)
distr_report_path = '//statbox/statbox-dict-by-name/distr_report/' + yesterday.strftime('%Y-%m-%d')
distr_file = cluster.driver.client.read_file(distr_report_path)

all_clids = {}
search_bar_clids = set()
search_widget_clids = set()
for distr_line in distr_file:
    columns = distr_line.split('\t')
    clid = columns[1]
    all_clids[clid] = columns
    type_code = columns[4]
    # An extra condition was considered here once: 'android' not in columns[3]
    if type_code == '1002':
        search_bar_clids.add(clid)
    if type_code == '1003':
        search_widget_clids.add(clid)


def search_lib_searches(records):
    """Fan every search record out over all report slices it belongs to.

    For each input record one output Record is yielded per combination of
    domain, clid domain, clid type, software kind and version, where every
    dimension takes both its concrete value and the catch-all 'all'.  The
    version dimension additionally gets 'new' or 'old', chosen by comparing
    the version with '400' (a plain comparison, so presumably lexicographic
    on strings -- TODO confirm the app_version format).

    Software kind is 'app' for the SearchApp/YandexSearch browsers, 'bro'
    when the browser is missing or does not start with 'Ya', and 'ya_bro'
    otherwise.  Clids absent from the module-level ``all_clids`` dictionary
    get 'null' for both clid domain and clid type.
    """
    unknown_clid_row = ['', '', '', 'null', 'null']
    for record in records:
        row = record.to_dict()

        browser = row["browser"]
        if browser in {"SearchApp", "YandexSearch"}:
            soft = 'app'
        elif browser is None or not browser.startswith('Ya'):
            soft = 'bro'
        else:
            soft = 'ya_bro'

        version = row["parsed_parameters"].get('app_version', ['all'])[0]
        clid_row = all_clids.get(row["clid"], unknown_clid_row)

        # Every dimension is a set, so a concrete value equal to 'all'
        # collapses with the catch-all instead of being emitted twice.
        domains = {row["canonized_vhost"].split('.')[-1], 'all'}
        clid_domains = {clid_row[3], 'all'}
        clid_types = {clid_row[4], 'all'}
        softs = {soft, 'all'}
        versions = {version, 'all', 'new' if version > '400' else 'old'}

        # Each identifier falls back to the other one when it is empty.
        uid = row["yandexuid"] or row.get('uuid')
        uuid = row["uuid"] or row["yandexuid"]

        for domain in domains:
            for clid_domain in clid_domains:
                for clid_type in clid_types:
                    for soft_name in softs:
                        for ver in versions:
                            yield Record(uid=uid, clid=row["clid"], date=row["date"],
                                         domain=domain, vhost=row["canonized_vhost"], ver=ver,
                                         clid_domain=clid_domain, clid_type=clid_type,
                                         values=row, soft=soft_name,
                                         uuid=uuid)


def join_with_old_data(groups):
    """Collapse each key group to a single row, preferring fresh data.

    Rows without a ``type`` field are treated as 'new'.  If the group has a
    'new' row it is emitted, otherwise one of the remaining rows is used.
    The emitted row is always marked with ``type='old'``.
    """
    for _key, rows in groups:
        by_type = {}
        for row in rows:
            # A later row of the same type replaces an earlier one.
            by_type[row.get('type', 'new')] = row

        if 'new' in by_type:
            chosen = by_type['new']
        else:
            chosen = list(by_type.values())[0]

        fields = chosen.to_dict()
        fields["type"] = 'old'
        yield Record(**fields)

# Job 1: daily search and user counts per report slice, merged into the
# accumulated table $job_root/s2_daily_searches.
job = cluster.job(name='2035_serchlib_searches', uuid_by_name=True)

# Days to read: the new window plus the same window shifted back by
# TIME_TO_REPEAT days (the set removes days present in both).
daily_log_days = set(list(jul.date_range(start_date, end_date)) +
                     list(jul.date_range(start_date - datetime.timedelta(TIME_TO_REPEAT),
                                         end_date - datetime.timedelta(TIME_TO_REPEAT))))
daily_log_tables = [job.table(jul.log_path(CLUSTER_NAME, 'cs', log_day))
                    for log_day in daily_log_days]

daily_slice_keys = ('date', 'clid_domain', 'clid_type', 'domain', 'soft', 'ver')

# Keep good non-robot searches that either carry a search-bar clid or have
# an 'app' URL parameter containing 'lib'.
daily_filters = [
    sf.equals('is_good_search', True),
    sf.equals('is_robot', False),
    sf.or_(sf.one_of('clid', search_bar_clids),
           sf.and_(sf.contains('parsed_parameters', 'app'),
                   sf.custom(lambda x: 'lib' in x['app'], 'parsed_parameters'))),
]

parsed_searches = job.concat(*daily_log_tables).qb2(
    log='request-cube',
    fields=['clid', 'browser', 'parsed_parameters', 'canonized_vhost',
            'yandexuid', 'uuid', 'device_id', 'date', 'is_mobile', 'is_tablet'],
    filters=daily_filters)

expanded_searches = parsed_searches.map(search_lib_searches).put('$job_root/tmp')

# First count searches per (slice, uuid); then the row count per slice is the
# number of users and the sum of the per-user counts is the number of searches.
searches_per_user = expanded_searches\
    .groupby(*(daily_slice_keys + ('uuid',)))\
    .aggregate(amount=na.count())\
    .put('$job_root/tmp2')
s1 = searches_per_user\
    .groupby(*daily_slice_keys)\
    .aggregate(users_amount=na.count(), amount=na.sum('amount'))

# Merge with the previously stored rows; join_with_old_data resolves which
# row survives for each slice.
stored_daily = job.table('$job_root/s2_daily_searches')
s2 = job.concat(s1, stored_daily)\
    .groupby(*daily_slice_keys)\
    .reduce(join_with_old_data)\
    .sort('date')\
    .put('$job_root/s2_daily_searches')
job.run()


def post_res_to_stat():
    """Upload the recent rows of s2_daily_searches to the Statface report.

    Only rows dated between ``start_date_str_rep`` and ``end_date_str``
    (inclusive) whose ``ver`` is at most three characters long are sent.
    Empty or missing field values are replaced by the string 'empty';
    ``fielddate`` is always taken from the row's ``date``.

    Posting is best effort: a failure is reported on the console and the
    script carries on.
    """
    records = cluster.read(ROOT[2:] + '/s2_daily_searches')
    f_list = {"fielddate", 'clid_domain', 'clid_type', 'domain', 'soft', 'ver',
              'amount', 'users_amount'}
    # Column-oriented accumulator: field name -> list of values, one per row.
    res = {}
    for elem in f_list:
        res[elem] = []
    for r in records:
        if not start_date_str_rep <= r.date <= end_date_str or len(r.ver) > 3:
            continue
        for field in f_list:
            if field == 'fielddate':
                value = r.date
            else:
                # Falsy values (None, '', 0) are posted as 'empty' as well.
                value = getattr(r, field, 'empty') or 'empty'
            res[field].append(value)
    res = jul.reformat_to_stat(res)
    # Dump the whole payload when some row ended up with an empty date.
    if "fielddate=\t" in res:
        print(res)
    try:
        jul.post_stat("Distribution", "Adhoc/searches_from_searchlib", res)
    except Exception:
        # Catch Exception rather than everything, so that Ctrl-C and
        # SystemExit still stop the script, and show the failure reason.
        import traceback
        traceback.print_exc()
        print('bad\n%s' % res[:1000])
    #jul.post_stat("Distribution", "Adhoc/searches_from_searchlib", res)
post_res_to_stat()

########################################################################################################################
#
# client = StatfaceBetaClient(
#     username="robot_galamaj",
#     password="entI2At5cemro4e"
# )
#
# report = StatfaceReport().path('Adhoc/galamaj/searchlib_clids').scale('daily')
#     # set the report path,
#     # .path('Adhoc/galamaj/distribution') \
#     # # daily granularity
#     # .scale('daily')
#     # and a couple of fields for overwriting the data
#     # .replace_mask('fielddate', 'browser')
# report = report.client(client)
# cluster = clusters.Hahn()
# report = report.data(cluster.read(ROOT[2:] + '/s2_daily_searches'))
# print report
# report.publish()
# print "done"
#
########################################################################################################################

def uuid_history(groups):
    """Merge fresh per-day search counts into the stored history of a uuid.

    Each group holds the rows of one uuid: at most one stored history row
    (``type == 'uuid_searches'``) plus fresh rows carrying ``date``,
    ``yandexuid`` and ``amount``.  One Record per uuid is yielded with

    * ``info``: day -> number of searches; days present in the fresh rows
      take the fresh total, all other days keep the stored value;
    * ``uids``: distinct 'y'-prefixed yandexuids, fresh ones first, followed
      by stored ones that were not seen in this run.

    Groups whose uuid is None are dropped.
    """
    for key, rows in groups:
        if key.uuid is None:
            continue

        searches_by_day = {}
        uids = []
        stored_days = {}
        stored_uids = []

        for row in rows:
            if row.get('type', 'new') == 'uuid_searches':
                # Previously accumulated history for this uuid.
                stored_days = row.info
                stored_uids = row.get('uids', [])
                continue

            if row.yandexuid:
                tagged_uid = 'y' + row.yandexuid
                if tagged_uid not in uids:
                    uids.append(tagged_uid)
            searches_by_day[row.date] = searches_by_day.get(row.date, 0) + row.amount

        # Stored values only fill in days that got no fresh data.
        for day in stored_days:
            searches_by_day.setdefault(day, stored_days[day])
        for stored_uid in stored_uids:
            if stored_uid not in uids:
                uids.append(stored_uid)

        yield Record(uuid=key.uuid, type='uuid_searches', info=searches_by_day,
                     uids=uids, searchlib_uuid=key.uuid)

# Job 2: per-uuid search history, merged into the accumulated table
# $job_root/searches_by_searchlib_uuid.
job = cluster.job(name='2035_serchlib_searches_by_uuid', uuid_by_name=True)

# Same set of days as the daily job: the new window plus the window shifted
# back by TIME_TO_REPEAT days.
uuid_log_days = set(list(jul.date_range(start_date, end_date)) +
                    list(jul.date_range(start_date - datetime.timedelta(TIME_TO_REPEAT),
                                        end_date - datetime.timedelta(TIME_TO_REPEAT))))
uuid_log_tables = [job.table(jul.log_path(CLUSTER_NAME, 'cs', log_day))
                   for log_day in uuid_log_days]

# Same selection as the daily job, but searches made from the
# SearchApp/YandexSearch browsers are excluded.
uuid_filters = [
    sf.equals('is_good_search', True),
    sf.equals('is_robot', False),
    sf.or_(sf.one_of('clid', search_bar_clids),
           sf.and_(sf.contains('parsed_parameters', 'app'),
                   sf.custom(lambda x: 'lib' in x['app'], 'parsed_parameters'))),
    sf.not_(sf.one_of('browser', ("SearchApp", "YandexSearch"))),
]

s1_stream = job.concat(*uuid_log_tables)\
    .qb2(
        log='request-cube',
        fields=['clid', 'browser', 'parsed_parameters', 'canonized_vhost',
                'yandexuid', 'uuid', 'device_id', 'date', 'is_mobile', 'is_tablet'],
        filters=uuid_filters)\
    .groupby('uuid', 'date', 'yandexuid')\
    .aggregate(amount=na.count())

# NOTE: the history table is read unconditionally. A disabled variant used
# only s1_stream when the table was missing or empty:
# tabs = [job.table('$job_root/searches_by_searchlib_uuid'), s1_stream
#         ] if jul.exists_and_not_empty(root+'/searches_by_searchlib_uuid', cluster) else [s1_stream]
stored_history = job.table('$job_root/searches_by_searchlib_uuid')
s2_stream = job.concat(stored_history, s1_stream)\
    .groupby('uuid')\
    .reduce(uuid_history)\
    .put('$job_root/searches_by_searchlib_uuid')
job.run()

# Remember the last processed day; the next run reads this file and starts
# from the following day. The context manager closes the file even if the
# write fails.
with open('2035_prev_date.txt', 'w') as log_file:
    log_file.write(end_date_str)
