from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os


def _value_or_other(mapping, key='value'):
    """Return ``mapping[key]``, or ``'other'`` when it is missing or null."""
    found = mapping.get(key)
    return 'other' if found is None else found


def _clean_name(raw):
    """Lower-case *raw*, extracting the display part of piped markup.

    For a value containing ``|`` the last two characters are dropped and the
    second ``|``-separated segment is returned. If there is no second segment
    (the only pipe sits in the dropped tail) the whole value is lower-cased,
    instead of raising ``IndexError``.
    """
    if '|' in raw:
        segments = raw[:-2].split('|')
        if len(segments) > 1:
            return segments[1].lower()
    return raw.lower()


def _first_and_unique(entries, key='value'):
    """Return ``(first raw value, set of all raw values)`` for a list of dicts.

    An empty or missing list yields ``('other', set())``.
    """
    if not entries:
        return 'other', set()
    raw_values = [_value_or_other(entry, key) for entry in entries]
    return raw_values[0], set(raw_values)


def _cleaned_names(card, field):
    """Return ``(first cleaned name, list of unique cleaned names)`` for *field*."""
    first_raw, unique_raw = _first_and_unique(card.get(field))
    return _clean_name(first_raw), [_clean_name(name) for name in unique_raw]


def _has_ru_locale(entry):
    """Tell whether ``'ru'`` occurs in the entry's ``RelevLocale`` (if any)."""
    return 'ru' in (entry.get('RelevLocale') or [])


def _last_ru_value(entries):
    """Return ``value`` of the last ``ru`` entry, or ``""`` if there is none."""
    result = ""
    if entries and isinstance(entries, list):
        for entry in entries:
            if _has_ru_locale(entry):
                result = entry.get('value')
    return result


def _parse_countries(entries):
    """Return ``(first country, list of unique countries)``, cleaned.

    Entries with a non-empty ``formatted`` list contribute the values of their
    ``ru`` forms; other entries contribute their own ``value``. The "first"
    country is taken only from the ``ru`` forms of the first entry (last
    matching form wins) and stays ``'other'`` otherwise.
    """
    first_raw = 'other'
    unique_raw = set()
    for position, entry in enumerate(entries or []):
        forms = entry.get('formatted')
        if forms:
            ru_names = [_value_or_other(form) for form in forms
                        if _has_ru_locale(form)]
            unique_raw.update(ru_names)
            if position == 0 and ru_names:
                first_raw = ru_names[-1]
        else:
            unique_raw.add(_value_or_other(entry))
    return _clean_name(first_raw), [_clean_name(name) for name in unique_raw]


def parse_oo_data(records):
    """Flatten ontology cards into one output record per input record.

    Each input record must expose ``key`` and ``value``; ``value`` is a JSON
    document that is decoded with ``json.loads``.

    Yields a ``Record`` with the fields ``onto_id`` (the input key),
    ``otype``/``osubtype``, ``genre``/``genres``, ``is_animated``
    (``'yes'``/``'no'``), ``director(s)``, ``producer(s)``, ``writer(s)``,
    ``developer(s)``, ``year``, ``country``/``countries``, ``query``,
    ``title`` and ``kp_url``. Singular name fields default to ``'other'``,
    ``year`` to ``'no_info'``, list fields to ``[]``.

    List fields are de-duplicated through a set, so their order is arbitrary.
    """
    for r in records:
        card = json.loads(r.value)
        # 'isa' may be absent or null; both are treated as an empty mapping.
        isa = card.get('isa') or {}

        # Object type / subtype come from the first 'otype' entry.
        otype = None
        osubtype = None
        type_entries = isa.get('otype', [])
        if type_entries:
            otype = type_entries[0].get('value')
            osubtype = type_entries[0].get('subvalue')

        # Genres are used as-is (no markup stripping, no lower-casing).
        genre, unique_genres = _first_and_unique(
            card.get('film_genres'), 'original_value')
        genres = list(unique_genres)

        # Animated if any 'isa' tag value mentions 'Animation'.
        animated = 'no'
        for tag in isa.get('tags') or []:
            if 'Animation' in (tag.get('value') or ''):
                animated = 'yes'

        director, directors = _cleaned_names(card, 'director')
        producer, producers = _cleaned_names(card, 'film_producer')
        writer, writers = _cleaned_names(card, 'writer')
        developer, developers = _cleaned_names(card, 'developers')

        # The film release year wins whenever the key exists, even if empty;
        # the series release year is only a fallback for a missing key.
        if 'film_release_year' in card:
            year_entries = card.get('film_release_year', [])
        else:
            year_entries = card.get('film_series_release_year', [])
        if year_entries:
            year = year_entries[0].get('value', 'no_info')
        else:
            year = 'no_info'

        country, countries = _parse_countries(card.get('countries'))

        query = _last_ru_value(card.get('SearchRequest', []))
        title = _last_ru_value(card.get('Title', []))

        # Last id whose value contains a KinoPoisk film URL; ids without a
        # value are skipped instead of raising TypeError.
        kp_url = None
        id_entries = card.get('ids', [])
        if id_entries and isinstance(id_entries, list):
            for entry in id_entries:
                link = entry.get('value')
                if link and 'kinopoisk.ru/film' in link:
                    kp_url = link

        yield Record(
            onto_id=r.key, genre=genre, genres=genres, is_animated=animated,
            director=director, directors=directors, producer=producer,
            producers=producers, writer=writer, writers=writers,
            developer=developer, developers=developers, year=year,
            countries=countries, country=country, query=query, title=title,
            otype=otype, osubtype=osubtype, kp_url=kp_url
        )


def run_parse():
    """Run the map job that turns ontology cards into flat records.

    Reads 'home/dict/ontodb/ver/daily/production/all_cards_final' on the Hahn
    cluster, applies ``parse_oo_data`` as a mapper and writes the result to
    '$job_root/parse_oo', where ``job_root`` is the template configured below.
    """
    job_templates = dict(
        job_root='home/videolog/vika-pavlova/2844-diversity_metrics',
    )
    spec_defaults = dict(
        pool_trees=["physical"],
        use_default_tentative_pool_trees=True,
    )
    cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
        templates=job_templates,
        yt_spec_defaults=spec_defaults,
        parallel_operations_limit=10,
    )

    job = cluster.job()

    cards = job.table('home/dict/ontodb/ver/daily/production/all_cards_final')
    parsed = cards.map(parse_oo_data, memory_limit=4000)
    parsed.put('$job_root/parse_oo')

    job.run()


def main():
    """Command-line entry point: validate the arguments and run the parse job.

    ``--start_date`` and ``--end_date`` are both required, so argparse exits
    with an error when either is missing. Their values are not used:
    ``run_parse()`` is called without arguments.
    """
    cli = argparse.ArgumentParser()
    for option in ('--start_date', '--end_date'):
        cli.add_argument(option, type=str, required=True)
    cli.parse_args()

    run_parse()


# Run the job only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
