from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

def totals(recs):
    """Emit a per-tag row and a '_total_' row for every input record.

    Each input record is read through its ``new_tag``, ``rel``, ``irrel``,
    ``_404`` and ``total`` attributes. For every record two ``Record``
    objects with identical counters are yielded: one labelled with the
    record's own tag and one labelled ``'_total_'``, so that a later
    group-by on ``tag`` can sum the overall totals.

    ``fielddate`` is the current local date formatted as ``YYYY-MM-DD``.
    """
    # Computed once per call: the value is loop-invariant, and hoisting it
    # guarantees that all rows produced by one call share the same date even
    # if processing happens to cross midnight.
    fielddate = datetime.date.today().isoformat()

    for rec in recs:
        counters = dict(
            rel_count=rec.rel,
            irrel_count=rec.irrel,
            mist_count=rec._404,
            total=rec.total
        )
        yield Record(tag=rec.new_tag, fielddate=fielddate, **counters)
        yield Record(tag='_total_', fielddate=fielddate, **counters)


def _has_tag(name):
    """Build the ``sf.contains('tag', name)`` filter for the given tag name."""
    return sf.contains('tag', name)


def _result_aggregators():
    """Return a fresh dict of distinct-'hash' counters keyed by verdict.

    Every counter is ``na.count_distinct('hash')`` restricted by a predicate
    on the ``result`` field; the keys become the output column names.
    """
    def count_hashes(predicate):
        return na.count_distinct('hash',
                                 predicate=nf.custom(predicate, 'result'))

    return dict(
        rel=count_hashes(lambda x: x == 'REL'),
        irrel=count_hashes(lambda x: x == 'NOT_REL'),
        _404=count_hashes(lambda x: x == '404'),
        other=count_hashes(lambda x: x not in ['NOT_REL', 'REL', '404']),
        total=count_hashes(lambda x: x != 'none')
    )


def _tag_stat(raw, condition, new_tag, output):
    """Filter *raw*, label it with *new_tag*, aggregate counters, put to *output*.

    Returns the stream produced by ``.put(output)`` so it can be concatenated
    with the other per-tag streams.
    """
    labelled = raw.filter(condition).project(ne.all(),
                                             new_tag=ne.const(new_tag))
    aggregated = labelled.groupby('new_tag').aggregate(**_result_aggregators())
    return aggregated.put(output)


def process_data_for_stat():
    """Build per-tag assessment counters and the final table for Statface.

    Reads the ``toloka_assessments`` table, derives a ``result`` column,
    splits the records into five tag groups, writes one ``*_stat`` table per
    group and finally writes ``$job_root/final_for_stat`` with the counters
    summed by ``fielddate`` and ``tag`` (including the ``'_total_'`` rows
    produced by ``totals``).
    """
    # NOTE(review): job_root has no leading '//' while put_data_to_stat reads
    # '//home/videolog/.../final_for_stat' -- presumably nile resolves both to
    # the same table; confirm.
    cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
        templates=dict(job_root='home/videolog/vika-pavlova/2406-learn_vs_pool'),
        yt_spec_defaults=dict(pool_trees=["physical"],
                              tentative_pool_trees=["cloud"]),
        parallel_operations_limit=10
    )

    job = cluster.job()

    # 'result' comes from the 'result' key when present, otherwise from
    # 'relevance', falling back to the string 'none'.
    raw = job.table(
        '//home/videolog/vika-pavlova/2406-learn_vs_pool/toloka_assessments'
    ).project(
        ne.all(),
        result=ne.custom(
            lambda x: x.get('result', 'none') if 'result' in x else x.get('relevance', 'none'),
            'assessment_result'
        )
    )

    # NOTE(review): records tagged PORNO + GOOD + VIDEOQUICK match none of the
    # groups below, and records tagged both PORNO and L2_answers match two of
    # them -- confirm this is intended.
    porno = _tag_stat(
        raw,
        sf.and_(_has_tag('PORNO'),
                sf.not_(_has_tag('GOOD')),
                sf.not_(_has_tag('VIDEOQUICK'))),
        'PORNO',
        '$job_root/porno_stat'
    )

    good = _tag_stat(
        raw,
        sf.and_(_has_tag('PORNO'),
                _has_tag('GOOD'),
                sf.not_(_has_tag('VIDEOQUICK'))),
        'PORNO,GOOD',
        '$job_root/good_stat'
    )

    videoquick = _tag_stat(
        raw,
        sf.and_(_has_tag('PORNO'),
                sf.not_(_has_tag('GOOD')),
                _has_tag('VIDEOQUICK')),
        'PORNO,VIDEOQUICK',
        '$job_root/videoquick_stat'
    )

    l2_answers = _tag_stat(
        raw,
        _has_tag('L2_answers'),
        'L2_answers',
        '$job_root/L2_answers_stat'
    )

    other = _tag_stat(
        raw,
        sf.and_(sf.not_(_has_tag('PORNO')),
                sf.not_(_has_tag('L2_answers'))),
        'OTHER_LEARN',
        '$job_root/other_stat'
    )

    pre_final = job.concat(porno, good, videoquick, other, l2_answers)

    # totals() duplicates every per-tag row under the '_total_' tag, so the
    # sums below yield both the per-tag and the overall counters.
    pre_final.map(totals, memory_limit=4000).groupby(
        'fielddate', 'tag'
    ).aggregate(
        rel_count=na.sum('rel_count'),
        irrel_count=na.sum('irrel_count'),
        mist_count=na.sum('mist_count'),
        total=na.sum('total')
    ).put('$job_root/final_for_stat')

    job.run()


def put_data_to_stat():
    """Publish the ``final_for_stat`` table to the daily Statface report.

    The upload token is read from the ``STAT_TOKEN`` environment variable;
    a missing variable raises ``KeyError``. The report
    ``Video.All/bin_relevance_report`` is published remotely from the
    ``hahn`` proxy with ``async_mode=False`` and ``upload_config=False``.
    """
    stat_token = os.environ['STAT_TOKEN']
    stat_client = ns.StatfaceClient(proxy='upload.stat.yandex-team.ru',
                                    token=stat_token)

    # Build the report step by step instead of one long fluent chain.
    report = ns.StatfaceReport()
    report = report.path('Video.All/bin_relevance_report')
    report = report.scale('daily')
    report = report.client(stat_client)

    report.remote_publish(
        proxy='hahn',
        table_path='//home/videolog/vika-pavlova/2406-learn_vs_pool/final_for_stat',
        async_mode=False,
        upload_config=False
    )


def main():
    """Parse ``--start_date`` / ``--end_date`` and run the pipeline per date.

    Both arguments are required strings and are passed to
    ``pd.date_range``; for every date in that range the data is processed
    and then published to Statface.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    args = parser.parse_args()

    # NOTE(review): the date of the iteration is not passed to either step
    # (the previously computed ``date_str`` local was never used), so each
    # iteration repeats the same processing and publish. Confirm whether the
    # date was meant to parameterize the run.
    for _ in pd.date_range(start=args.start_date, end=args.end_date):
        process_data_for_stat()
        put_data_to_stat()


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

