#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    Record,
    clusters,
    aggregators as na,
    filters as nf,
    extractors as ne,
    statface as ns
)
import nile.files as nfi
from pytils import get_host
import json


# Column name of the canonical page URL.
# NOTE(review): main() spells this literal inline instead of using the
# constant; confirm whether the constant is still needed.
CANOURL = 'page_url_cano'


def joined_and_totalize(groups):
    """Fold per-host (joined, count) rows into one row and mirror it to a total.

    For every ``(key, records)`` pair two records are yielded: one carrying
    ``key.host`` and one carrying the pseudo-host ``'_total_'``. Both hold
    the same ``joined`` / ``not_joined`` fields, each taken from the
    ``count`` of the record whose ``joined`` flag is truthy / falsy. A field
    is left out when no record with that flag was seen.
    """
    for key, records in groups:
        host = key.host
        counts = {}
        for row in records:
            if row.joined:
                counts['joined'] = row.count
            else:
                counts['not_joined'] = row.count
        # Same counters twice: once for the host, once for the grand total
        # that is summed across hosts by a later aggregation step.
        for label in (host, '_total_'):
            yield Record(host=label, **counts)


def safe_share(x, y):
    """Return the share of ``x`` in ``x + y``.

    Falsy arguments (``None``, ``0``) count as zero. When both are falsy
    the integer ``0`` is returned instead of dividing by zero.
    """
    part = x if x else 0
    rest = y if y else 0
    if part or rest:
        return part / float(part + rest)
    return 0


def _read_table_path(path):
    """Return the ``table`` value from the JSON file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the document has been parsed.
    """
    with open(path) as source:
        return json.load(source)['table']


def main():
    """Build the daily join-share report table and optionally publish it.

    Command-line arguments:
        --input1: JSON file whose ``table`` key names the table that holds
            ``pool_url_cano`` values (required).
        --input2: JSON file whose ``table`` key names the table that holds
            ``page_url_cano`` values (required).
        --date: report date; used in the output table names and written to
            the ``fielddate`` column (required).
        --job_root: root path under which output tables are written.
        --pool: optional pool forwarded to the cluster client.
        --report: optional Statface report path; when given, the report
            table is read back and published there.

    Environment:
        ``YT_TOKEN`` is always read; ``STAT_LOGIN`` and ``STAT_PASSWORD``
        are read only when ``--report`` is given.
    """
    parser = argparse.ArgumentParser()
    # These three were always dereferenced unconditionally below, so
    # declaring them required turns a traceback into a usage message.
    parser.add_argument('--input1', required=True)
    parser.add_argument('--input2', required=True)
    parser.add_argument('--report')
    parser.add_argument('--job_root', default='//home/videolog/mma-1574')
    parser.add_argument('--date', required=True)
    parser.add_argument('--pool')
    args = parser.parse_args()

    report_path = args.report
    job_root = args.job_root

    date = args.date.strip()
    kwargs = {'token': os.environ['YT_TOKEN']}
    if args.pool:
        kwargs['pool'] = args.pool

    input1 = _read_table_path(args.input1)
    input2 = _read_table_path(args.input2)

    cluster = clusters.yt.Hahn(**kwargs)

    report_table = '{}/reports/{}'.format(job_root, date)
    debug_table = '{}/daily/{}_debug'.format(job_root, date)

    job = cluster.job()

    canourls = job.table(input2).unique('page_url_cano')

    # Left-join unique pool URLs against unique page URLs, keep the joined
    # rows in a debug table, then count joined / not joined rows per host.
    job.table(
        input1
    ).unique(
        'pool_url_cano'
    ).join(
        canourls, type='left', by_left='pool_url_cano',
        by_right='page_url_cano'
    ).sort(
        'page_url_cano'
    ).put(
        debug_table
    ).project(
        'pool_url_cano', host=ne.custom(
            lambda x: get_host(x), 'pool_url_cano'
        ),
        # A row counts as joined when its page_url_cano value is truthy.
        joined=ne.custom(bool, 'page_url_cano'),
        files=[nfi.LocalFile('pytils.py')]
    ).groupby(
        'host', 'joined'
    ).aggregate(
        count=na.count()
    ).groupby(
        'host'
    ).reduce(
        joined_and_totalize
    ).groupby(
        'host'
    ).aggregate(
        # The reducer emits a '_total_' row per host; summing here collapses
        # those rows into one overall total.
        joined=na.sum('joined'), not_joined=na.sum('not_joined')
    ).project(
        ne.all(),
        fielddate=ne.const(date),
        joined_share=ne.custom(
            safe_share, 'joined', 'not_joined'
        ),
        total=ne.custom(
            lambda x, y: (x or 0) + (y or 0), 'joined', 'not_joined'
        )
    ).put(
        report_table
    )

    job.run()

    if report_path:
        client = ns.StatfaceClient(
            proxy='upload.stat.yandex-team.ru',
            username=os.environ['STAT_LOGIN'],
            password=os.environ['STAT_PASSWORD']
        )

        ns.StatfaceReport().path(
            report_path
        ).scale('daily').replace_mask(
            'fielddate'
        ).client(
            client
        ).data(
            cluster.read(report_table)
        ).publish()
        print('Pushed to {}'.format(report_path))


# Script entry point: run main() only when the file is executed directly.
if __name__ == "__main__":
    main()
