#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://st.yandex-team.ru/EXPERIMENTS-21325

from nile.api.v1 import (
    Record,
    files,
    clusters,
    cli,
    with_hints,
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns #obligatory for Statface
)
from qb2.api.v1 import (
    QB2,
    resources as sr,
    filters as qf
)

import os #obligatory for Statface
import sys #obligatory for Statface
import re #obligatory for Statface
import datetime
import time
import re

import uatraits
import random


# Path of the export_page dictionary table (presumably on YT -- confirm).
# NOTE(review): this constant is not referenced anywhere in this file;
# parse_chevent_img loads the dictionary through the 'export_page' resource
# name instead.
EXPORT_PAGE = '//statbox/statbox-dict-last/export_page'


@with_hints(output_schema=dict(uid=str, testid=str, date=str))
def get_testids(recs):
    """Emit one ``(uid, testid, date)`` record per requested test id in a row.

    Each input record is expected to expose:
      * ``testids`` -- comma-separated test ids to look for (spaces ignored);
      * ``value``   -- tab-separated test ids attached to ``key``;
      * ``key``     -- user identifier carrying a ``y`` or ``uu/`` prefix;
      * ``date``    -- copied to the output unchanged.

    Only the leading prefix is stripped from ``key``; the rest of the
    identifier is left untouched even if it contains ``y`` or ``uu/`` again.
    """
    last_raw = None
    wanted = frozenset()

    for rec in recs:
        # ``testids`` is normally the same constant on every record, so the
        # parsed set is reused until the raw string changes.
        raw = rec.testids
        if raw != last_raw:
            wanted = frozenset(raw.replace(" ", "").split(","))
            last_raw = raw

        key = rec.key
        if key.startswith('y'):
            uid = key[1:]
        elif key.startswith('uu/'):
            uid = key[len('uu/'):]
        else:
            uid = key

        for testid in rec.value.split("\t"):
            if testid in wanted:
                yield Record(uid=uid, testid=testid, date=rec.date)


@with_hints(
    output_schema=dict(
        uid=str,
        testid=str,
        page=str
        )
    )
def myMap_2(recs):
    """Emit ``(uid, testid, page)`` for requested test ids found in a log row.

    Each input record is expected to expose:
      * ``value``   -- tab-separated ``name=value`` pairs; the ``search_props``,
        ``serp_url`` and ``uid`` pairs are used;
      * ``testids`` -- comma-separated test ids to look for (spaces ignored).

    The active test ids are read from the text that follows ``test-ids=`` in
    ``search_props``, up to the first comma, split on spaces.

    ``page`` is the part of ``serp_url`` between ``yandex.`` and ``?``, e.g.
    ``serp_url=https://yandex.fr/images/search?callback=`` gives
    ``fr/images/search``.  It is an empty string when ``serp_url`` is missing,
    has no ``?`` or has no ``yandex.`` before the ``?``.

    Raises:
        KeyError: if a matching row has no ``uid`` pair.
    """
    for rec in recs:
        wanted = rec.testids.replace(" ", "").split(",")

        fields = dict(
            pair.split('=', 1) for pair in rec.value.split('\t') if '=' in pair
        )

        search_props = fields.get('search_props')
        if search_props is None or 'test-ids=' not in search_props:
            continue

        after_marker = search_props.split('test-ids=')[1]
        if after_marker.startswith(','):
            continue

        first_group = after_marker.split(',')[0]
        # NOTE(review): rows whose first group holds a single test id (no
        # space) are skipped, as before -- confirm this is intended.
        if ' ' not in first_group:
            continue
        active = set(first_group.split(' '))

        # Always start from an empty page so that a URL which does not match
        # can neither fail nor inherit the page of an earlier row.
        page = ""
        serp_url = fields.get('serp_url', '')
        if '?' in serp_url:
            host = serp_url.split('?')[0]
            # Look for the marker in the part before '?' only, so the split
            # below is guaranteed to produce a second element.
            if 'yandex.' in host:
                page = host.split('yandex.')[1]

        for testid in wanted:
            if testid in active:
                yield Record(uid=fields['uid'], testid=testid, page=page)


@with_hints(
    output_schema=dict(
        page_id=str,
        click=int,
        show=int,
        cost=float,
        uid=str
    )
)
def parse_chevent_img(recs):
    """Turn events on image-search pages into click/show/cost records.

    The ``export_page`` JSON resource is scanned once; every entry whose
    ``Name`` contains ``images.yandex`` or ``gorsel.yandex`` contributes its
    key to the set of accepted page ids.  Records whose ``pageid`` is not in
    that set are dropped.

    For accepted records:
      * ``countertype == "2"`` is counted as a click, anything else as a show;
      * ``cost`` is ``click * eventcost * 30 / 1000000``, so shows cost 0.
        NOTE(review): the meaning of the 30 and 1000000 factors is not
        visible here -- presumably a unit/currency conversion; confirm.
      * ``uid`` is ``str(rec.uniqid)``.
    """
    export_page_full = sr.get(sr.json('export_page'))
    # A set keeps the per-record membership test constant-time.
    image_page_ids = set()
    for page_key, page_info in export_page_full.iteritems():
        if "Name" not in page_info:
            continue
        name = page_info["Name"]
        if "images.yandex" in name or "gorsel.yandex" in name:
            image_page_ids.add(page_key)

    for rec in recs:
        pageid = rec.pageid
        if pageid not in image_page_ids:
            continue

        if rec.countertype == "2":
            click, show = 1, 0
        else:
            click, show = 0, 1

        eventcost = float(rec.eventcost)
        cost = click * eventcost * 30 / 1000000

        yield Record(page_id=pageid,
                     click=click,
                     show=show,
                     cost=cost,
                     uid=str(rec.uniqid)
                     )

@with_hints(output_schema=dict(uid=str, bucket=int))
def gen_bucket(recs):
    """Assign every input record a random bucket number from 0 to 99.

    Yields one ``(uid, bucket)`` record per input record, matching the
    declared output schema.  The bucket is drawn independently for each
    record, so the same ``uid`` seen twice may get different buckets.
    """
    for rec in recs:
        bucket = random.randrange(0, 100, 1)
        yield Record(uid=rec.uid, bucket=bucket)


def parse_from_path(s):
    """Split a slash-separated path into ``[parent, last_component]``.

    Everything before the final ``/`` becomes the parent (an empty string
    when the path contains no ``/``); the rest is the last component.
    """
    parent, _, leaf = s.rpartition('/')
    return [parent, leaf]


@cli.statinfra_job(options=[cli.Option('test_ids', default='?')])
def make_job(job, nirvana, options):
    """Build the job that stores users of the requested test ids with buckets.

    The first Nirvana output table path is split into a folder and a table
    name.  The folder becomes ``job_root`` and ``<folder>/temporary`` becomes
    ``tmp_files``.  For every date in ``options.dates`` the table
    ``//home/abt/yuid_testids/<date without dashes>`` is read and tagged with
    a ``date`` column.  Rows are filtered by ``get_testids``, deduplicated on
    ``(uid, testid, date)``, joined with one random bucket (0..99) per uid and
    written to ``$tmp_files/<table name>_0uids``.

    NOTE(review): ``options.dates`` is not declared in the option list above;
    presumably it is supplied by ``cli.statinfra_job`` -- confirm.
    NOTE(review): an unset ``test_ids`` option (default ``?``) only prints a
    message to stderr; the job is still built and returned.
    NOTE(review): an alternative source based on ``//logs/img-reqans-log`` and
    ``myMap_2`` is not wired in here.
    """
    output_folder, table_name = parse_from_path(nirvana.output_tables[0])

    yt_defaults = dict(
        pool_trees=["physical"],
        tentative_pool_trees=["cloud"]
    )
    path_templates = dict(
        job_root=output_folder,
        tmp_files=output_folder + "/temporary"
    )
    job = job.env(yt_spec_defaults=yt_defaults, templates=path_templates)

    testids = options.test_ids
    if testids == "?":
        print >> sys.stderr, 'wrong testids'

    # One source table per requested day, each tagged with its own date.
    daily_tables = []
    for day in options.dates:
        source = job.table('//home/abt/yuid_testids/' + day.replace("-", ""))
        daily_tables.append(
            source.project(ne.all(), date=ne.const(day).add_hints(type=str))
        )

    all_days = job.concat(*daily_tables)
    with_wanted = all_days.project(ne.all(), testids=ne.const(testids))
    uids = with_wanted.map(get_testids).unique('uid', 'testid', 'date')

    # A single random bucket per distinct uid.
    distinct_users = uids.unique('uid')
    buckets = distinct_users.project(
        'uid',
        bucket=ne.custom(
            lambda x: random.randrange(0, 100, 1), 'uid'
        ).add_hints(type=int)
    )

    # Resulting columns: testid, bucket, uid, date.
    result_path = "$tmp_files" + "/" + table_name + "_0uids"
    uids.join(buckets, by='uid').put(result_path)

    return job


if __name__ == '__main__':
    # Hand control to the nile CLI runner when the file is executed as a script.
    cli.run()

