# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime, time
import json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os
import copy
import random


# Module-level Hahn YT cluster handle used by the functions below to build
# jobs and to read tables.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    # Value substituted for the job_root template.
    templates={'job_root': 'home/videolog/vika-pavlova/2406-learn_vs_pool'},
    # Defaults merged into the YT spec of every operation.
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'tentative_pool_trees': ["cloud"],
    },
    parallel_operations_limit=10,
)

class gather_data():
    """Reducer that assembles the Toloka task list for one (puid, workerId) group.

    Every incoming record is dispatched on its ``content_type``:

    * ``0`` -- SERP recommendations: up to ``MAX_COMPONENTS`` leading entries
      of ``components`` become plain tasks (records with empty ``components``
      are ignored);
    * ``1`` -- cold-start recommendations, turned into plain tasks the same way;
    * ``2`` -- dislikes, added as honeypots;
    * ``3`` -- previous results, added as honeypots.

    Dislike and previous-result honeypots share one budget of
    ``MAX_PERSONAL_HONEYPOTS`` per group.  After the records are consumed, up
    to ``COLD_HONEYPOTS_PER_KIND`` good and as many bad cold honeypots are
    drawn at random from the pools given to the constructor.

    An ``onto_id`` is never added twice within a group.

    Unlike a plain rejection-sampling loop, cold honeypot selection stops as
    soon as a pool has no unused ``onto_id`` left (or is empty), so the
    reducer cannot hang or raise ``IndexError`` on a small pool; in that case
    ``has_good_cold`` / ``has_bad_cold`` report the number actually added.
    """

    # At most this many leading components are taken from a single record.
    MAX_COMPONENTS = 10
    # Shared per-group budget for dislike / previous-result honeypots.
    MAX_PERSONAL_HONEYPOTS = 2
    # How many good and how many bad cold honeypots are requested per group.
    COLD_HONEYPOTS_PER_KIND = 2

    # (label, weight) pairs turned into "knownSolutions" entries.
    NEGATIVE_SOLUTIONS = (("-2", 1), ("-1", 1), ("0", 0.5))
    GOOD_COLD_SOLUTIONS = (("+2", 1),)
    BAD_COLD_SOLUTIONS = (("-2", 1),)

    def __init__(self, carousel_type, good_cold_hp, bad_cold_hp):
        """Store the carousel type and the good / bad cold honeypot pools.

        Each pool is a sequence of dicts with the keys ``onto_id``, ``title``,
        ``thumbnail``, ``description`` and ``embed_url``.
        """
        self.good_cold_hp = good_cold_hp
        self.bad_cold_hp = bad_cold_hp
        self.carousel_type = carousel_type

    def _add_components(self, components, seen_ids, tasks):
        """Append a plain task for every not-yet-seen leading component."""
        for recom in components[:self.MAX_COMPONENTS]:
            onto_id = recom["onto_id"]
            if onto_id in seen_ids:
                continue
            seen_ids.add(onto_id)
            tasks.append({
                "inputValues": {
                    "embed_url": recom["componentUrl"]["pageUrl"],
                    "title": recom["text.title"],
                    "thumbnail": recom['thumbadd']["urls"][0],
                    "description": recom["text.snippet"],
                    "onto_id": onto_id,
                },
            })

    def _add_honeypot(self, item, weighted_labels, seen_ids, tasks):
        """Append a honeypot task for *item*; return True if it was added.

        Nothing is appended (and False is returned) when the item's
        ``onto_id`` is already present in *seen_ids*.
        """
        onto_id = item["onto_id"]
        if onto_id in seen_ids:
            return False
        seen_ids.add(onto_id)
        tasks.append({
            "inputValues": {
                "title": item["title"],
                "thumbnail": item["thumbnail"],
                "description": item["description"],
                "embed_url": item["embed_url"],
                "onto_id": onto_id,
            },
            # Fresh dicts per task, so tasks never share mutable state.
            "knownSolutions": [
                {"weight": weight, "outputValues": {"label": label}}
                for label, weight in weighted_labels
            ],
        })
        return True

    def _add_cold_honeypots(self, pool, weighted_labels, seen_ids, tasks):
        """Add random cold honeypots from *pool*; return how many were added.

        Each pick is uniform over the pool entries whose ``onto_id`` is still
        unused.  The loop ends early when no such entry remains, which is what
        keeps an empty or exhausted pool from looping forever.
        """
        added = 0
        while added < self.COLD_HONEYPOTS_PER_KIND:
            candidates = [hp for hp in pool if hp["onto_id"] not in seen_ids]
            if not candidates:
                break
            self._add_honeypot(random.choice(candidates), weighted_labels,
                               seen_ids, tasks)
            added += 1
        return added

    def __call__(self, groups):
        """Yield one output Record per (key, records) pair in *groups*.

        The Record carries the key fields ``puid`` and ``workerId``, the
        collected ``toloka_data`` list, the list of used ``onto_id`` values
        (``recs_set``) and the ``has_*`` flags / counters describing what was
        added.
        """
        for key, recs in groups:
            seen_ids = set()
            toloka_data = []

            has_recoms = 0
            has_cold_recoms = 0
            has_dislike = 0
            has_bad_res = 0
            has_pers_hp = 0

            for rec in recs:
                content_type = rec['content_type']

                # SERP recommendations
                if content_type == 0 and rec['components']:
                    self._add_components(rec['components'], seen_ids, toloka_data)
                    has_recoms = 1

                # dislikes are added as honeypots
                elif content_type == 2:
                    if (has_pers_hp < self.MAX_PERSONAL_HONEYPOTS
                            and self._add_honeypot(rec, self.NEGATIVE_SOLUTIONS,
                                                   seen_ids, toloka_data)):
                        has_pers_hp += 1
                        has_dislike += 1

                # previous results are added as honeypots
                elif content_type == 3:
                    if (has_pers_hp < self.MAX_PERSONAL_HONEYPOTS
                            and self._add_honeypot(rec, self.NEGATIVE_SOLUTIONS,
                                                   seen_ids, toloka_data)):
                        has_pers_hp += 1
                        has_bad_res += 1

                # cold-start recommendations
                elif content_type == 1:
                    self._add_components(rec['components'], seen_ids, toloka_data)
                    has_cold_recoms = 1

            # cold honeypots: good ones first, then bad ones
            has_good_cold = self._add_cold_honeypots(
                self.good_cold_hp, self.GOOD_COLD_SOLUTIONS, seen_ids, toloka_data)
            has_bad_cold = self._add_cold_honeypots(
                self.bad_cold_hp, self.BAD_COLD_SOLUTIONS, seen_ids, toloka_data)

            yield Record(puid=key['puid'], workerId=key['workerId'],
                         has_recoms=has_recoms, has_cold_recoms=has_cold_recoms,
                         toloka_data=toloka_data,
                         has_pers_hp=has_pers_hp, has_dislike=has_dislike,
                         has_bad_res=has_bad_res,
                         has_bad_cold=has_bad_cold,
                         has_good_cold=has_good_cold,
                         recs_set=list(seen_ids)
                         )


def process_data_for_toloka(carousel_type, good_cold_hp, bad_cold_hp, date):
    """Build and write the Toloka input table for one carousel type and date.

    Four source tables are tagged with a ``content_type`` column
    (0 = recoms, 1 = cold_recoms, 2 = dislikes, 3 = previous_results) and
    concatenated; ``puid`` is cast to ``str`` in all of them except recoms.
    The rows are grouped by (puid, workerId), sorted by ``content_type`` and
    reduced with ``gather_data``.  Only groups where both ``has_recoms`` and
    ``has_cold_recoms`` equal 1 are kept; they are joined with the
    current_tolokers table and written to
    ``data_for_toloka/<carousel_type>/<date>``.
    """
    print('process_data_for_toloka')

    def as_string(value):
        return str(value)

    def has_both_recom_kinds(recoms_flag, cold_flag):
        return recoms_flag == 1 and cold_flag == 1

    job = cluster.job()

    tolokers = job.table('//home/videolog/vika-pavlova/video_recommendations/current_tolokers')

    # Regular recommendations keep puid exactly as stored.
    recoms_source = job.table(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/recoms/' + carousel_type + '/' + date
    )
    recoms = recoms_source.project(ne.all(), content_type=ne.const(0))

    # In the remaining sources puid is re-emitted as a string.
    cold_source = job.table(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/cold_recoms/' + carousel_type + '/' + date
    )
    cold_recoms = cold_source.project(
        ne.all(exclude='puid'),
        content_type=ne.const(1),
        puid=ne.custom(as_string, 'puid'),
    )

    dislikes_source = job.table(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/dislikes/' + carousel_type
    )
    dislikes = dislikes_source.project(
        ne.all(exclude='puid'),
        content_type=ne.const(2),
        puid=ne.custom(as_string, 'puid'),
    )

    previous_source = job.table(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/previous_results/' + carousel_type
    )
    previous_results = previous_source.project(
        ne.all(exclude='puid'),
        content_type=ne.const(3),
        puid=ne.custom(as_string, 'puid'),
    )

    total = job.concat(recoms, cold_recoms, dislikes, previous_results)

    grouped = total.groupby('puid', 'workerId').sort('content_type')
    pre_final = grouped.reduce(gather_data(carousel_type, good_cold_hp, bad_cold_hp))

    complete_groups = pre_final.filter(
        sf.custom(has_both_recom_kinds, 'has_recoms', 'has_cold_recoms')
    )
    with_tolokers = complete_groups.join(tolokers, by=('puid', 'workerId'))
    with_tolokers.put(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/data_for_toloka/' + carousel_type + '/' + date
    )

    job.run()


def main():
    """Parse the command line, load the cold honeypot pools and run the job.

    Required arguments: ``--carousel_type`` and ``--date``.  The bad and the
    good cold honeypots are read (in that order) from the
    ``new_cold_honeypots/<carousel_type>/`` tables and handed over to
    ``process_data_for_toloka``.
    """
    parser = argparse.ArgumentParser()
    for flag in ('--carousel_type', '--date'):
        parser.add_argument(flag, type=str, required=True)
    args = parser.parse_args()

    honeypot_fields = ("onto_id", "title", "thumbnail", "description", "embed_url")

    def load_honeypots(table_path):
        # Keep only the fields the reducer needs, as plain dicts.
        return [
            dict((field, row[field]) for field in honeypot_fields)
            for row in cluster.read(table_path)
        ]

    bad_cold_hp = load_honeypots(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/new_cold_honeypots/' + args.carousel_type + '/bad_cold'
    )
    good_cold_hp = load_honeypots(
        '//home/videolog/vika-pavlova/video_recommendations/offline_recomms/new_cold_honeypots/' + args.carousel_type + '/good_cold'
    )

    process_data_for_toloka(args.carousel_type, good_cold_hp, bad_cold_hp, args.date)


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
