# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

# Shared YT (Hahn) cluster handle used to create the job in this script.
# It runs in the 'vika-pavlova' pool; the env() call sets the job_root
# template, the default YT spec pool trees and the parallel operations limit.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={
        'job_root': 'home/videolog/vika-pavlova/2406-learn_vs_pool',
    },
    yt_spec_defaults={
        'pool_trees': ['physical'],
        'tentative_pool_trees': ['cloud'],
    },
    parallel_operations_limit=10
)

# Directory that holds every input table and the resulting report table.
_DATA_ROOT = '//home/videolog/vika-pavlova/2406-learn_vs_pool/'

# Countries that get their own report row; everything else is folded
# into a single 'other' row.
_TRACKED_COUNTRIES = ('RU', 'BY', 'KZ', 'UA', 'UZ')

# (output column, value of `res` counted in that column) per markup family.
_BINARY_LABELS = (
    ('assessments_count_rel', 'REL'),
    ('assessments_count_irrel', 'NOT_REL'),
)
_GRADED_LABELS = (
    ('assessments_count_rel_plus', 'RELEVANT_PLUS'),
    ('assessments_count_rel_minus', 'RELEVANT_MINUS'),
    ('assessments_count_irrel', 'IRRELEVANT'),
)
_QUALITY_LABELS = (
    ('assessments_count_ok', 'OK'),
    ('assessments_count_norm', 'NORM'),
    ('assessments_count_bad', 'BAD'),
)
_CATEGORY_LABELS = (
    ('assessments_count_film', 'film'),
    ('assessments_count_serial', 'serial'),
    ('assessments_count_porno', 'porno'),
)

# Per-label columns of the final report; a row that lacks one gets 0 there.
_REPORT_COUNT_COLUMNS = (
    'assessments_count_rel_plus',
    'assessments_count_rel_minus',
    'assessments_count_rel',
    'assessments_count_irrel',
    'assessments_count_ok',
    'assessments_count_bad',
    'assessments_count_norm',
    'assessments_count_film',
    'assessments_count_serial',
    'assessments_count_porno',
    'assessments_count_other',
)


def _item(source_field, key):
    """Extractor returning source_field[key]; a missing key is an error."""
    return ne.custom(lambda container: container[key], source_field)


def _item_or(source_field, key, fallback):
    """Extractor returning source_field.get(key, fallback)."""
    return ne.custom(lambda container: container.get(key, fallback), source_field)


def _load_markup(job, table_name, fielddate, markup_type, **derived_fields):
    """Read one input table and attach fielddate, markup_type and derived fields."""
    return job.table(_DATA_ROOT + table_name).project(
        ne.all(),
        fielddate=ne.const(fielddate),
        markup_type=ne.const(markup_type),
        **derived_fields
    )


def _label_counters(labels, remainder_column=None):
    """Build aggregate() kwargs: a row count plus one distinct-hash counter per label.

    Fresh aggregator objects are created on every call so that no aggregator
    instance is shared between separate aggregate() nodes.
    NOTE(review): the predicates are closures over `label` / `known`; this
    assumes nile serialises closures the same way it serialises the inline
    lambdas used previously -- confirm on a test run.
    """
    counters = {'assessments_count': na.count()}
    for column, label in labels:
        counters[column] = na.count_distinct(
            'hash',
            predicate=_res_equals(label),
        )
    if remainder_column is not None:
        known = tuple(label for _, label in labels)
        counters[remainder_column] = na.count_distinct(
            'hash',
            predicate=nf.custom(lambda res: res not in known, 'res'),
        )
    return counters


def _res_equals(label):
    """Predicate that is true when the record's `res` equals `label`."""
    return nf.custom(lambda res: res == label, 'res')


def _total_row(stream, labels, remainder_column=None):
    """Aggregate the whole stream into a single row with country='total'."""
    return stream.groupby(
        'fielddate', 'markup_type'
    ).aggregate(
        **_label_counters(labels, remainder_column)
    ).project(
        ne.all(),
        country=ne.const('total'),
    )


def _country_rows(stream, labels):
    """Return (per tracked country rows, single country='other' row)."""
    tracked = _TRACKED_COUNTRIES

    per_country = stream.filter(
        sf.custom(lambda country: country in tracked, 'country')
    ).groupby(
        'fielddate', 'markup_type', 'country'
    ).aggregate(
        **_label_counters(labels)
    )

    everything_else = stream.filter(
        sf.custom(lambda country: country not in tracked, 'country')
    ).groupby(
        'fielddate', 'markup_type'
    ).aggregate(
        **_label_counters(labels)
    ).project(
        ne.all(),
        country=ne.const('other'),
    )

    return per_country, everything_else


def process_data_for_stat():
    """Aggregate all assessment tables into the `results_for_report` table.

    For every markup type the job writes a 'total' row; markup types that
    carry a country also get one row per tracked country and an 'other'
    row. Each row holds `assessments_count` (number of input rows) and the
    per-label counters (number of distinct `hash` values with that label);
    counters that do not apply to a markup type are written as 0.

    The report date is taken once, when the job graph is built, so every
    row of a single run carries the same `fielddate`.
    """
    job = cluster.job()

    # Same 'YYYY-MM-DD' text as str(datetime.today()).split(' ')[0], but
    # evaluated a single time so all markup types agree on the date.
    fielddate = datetime.date.today().isoformat()

    l2 = _load_markup(
        job, 'toloka_assessments_l2', fielddate, 'L2_answers',
        res=_item('assessment_result', 'result'),
        country=_item('judgement_item', 'region'),
    )
    vpq = _load_markup(
        job, 'toloka_assessments_vpq3', fielddate, 'vpq3',
        res=_item('assessment_result', 'result'),
        country=_item('judgement_item', 'region'),
    )
    yang = _load_markup(
        job, 'yang_assessments', fielddate, 'yang',
        res=_item('assessment_result', 'relevance'),
        country=_item('judgement_item', 'country'),
    )
    learn = _load_markup(
        job, 'toloka_assessments_learn', fielddate, 'bin_relevance_learn',
        res=_item_or('assessment_result', 'result', 'no_result'),
        country=_item_or('judgement_item', 'region', 'other'),
    )
    # Categories and object types have no country, so only totals are built.
    categ = _load_markup(
        job, 'categories_assessments', fielddate, 'categories_learn',
        res=_item_or('assessment_result', 'category', 'no_result'),
    )
    object_type = _load_markup(
        job, 'object_type_assessments', fielddate, 'object_type_learn',
    )
    quality = _load_markup(
        job, 'quality_assessments', fielddate, 'quality_learn',
        res=_item_or('assessment_result', 'result', 'no_result'),
        country=_item_or('judgement_item', 'country', 'other'),
    )
    vh_content = _load_markup(
        job, 'yang_vh_content', fielddate, 'yang_vh_content',
        res=_item('assessment_result', 'relevance'),
        country=_item('judgement_item', 'country'),
    )

    report_parts = []

    def add_with_countries(stream, labels):
        # total row, then per-country rows, then the 'other' row
        report_parts.append(_total_row(stream, labels))
        report_parts.extend(_country_rows(stream, labels))

    add_with_countries(l2, _BINARY_LABELS)
    add_with_countries(vpq, _BINARY_LABELS)
    add_with_countries(yang, _GRADED_LABELS)
    add_with_countries(learn, _BINARY_LABELS)
    report_parts.append(
        _total_row(categ, _CATEGORY_LABELS,
                   remainder_column='assessments_count_other')
    )
    add_with_countries(quality, _QUALITY_LABELS)
    # Object types are only counted, there are no per-label counters.
    report_parts.append(_total_row(object_type, ()))
    add_with_countries(vh_content, _GRADED_LABELS)

    # Different markup types produce different counter columns; replace the
    # ones a row does not have (or that are empty) with 0.
    zero_filled = dict(
        (column, ne.custom(lambda count: count or 0, column))
        for column in _REPORT_COUNT_COLUMNS
    )

    job.concat(*report_parts).project(
        'fielddate', 'markup_type', 'country', 'assessments_count',
        **zero_filled
    ).put(_DATA_ROOT + 'results_for_report')

    job.run()


def main():
    """Command-line entry point: validate the arguments and build the report.

    Both --start_date and --end_date are mandatory strings. They are parsed
    (so a missing flag aborts the run) but their values are not forwarded
    to process_data_for_stat().
    """
    cli = argparse.ArgumentParser()
    for flag in ('--start_date', '--end_date'):
        cli.add_argument(flag, type=str, required=True)
    cli.parse_args()

    process_data_for_stat()


# Run the report only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
