#!/usr/bin/env python
# coding: utf-8

import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace
# at interpreter start-up, so the module is reloaded to get it back before
# switching the process-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

import os
from datetime import datetime
import yt.wrapper as yt
import itertools
from scipy import spatial
from collections import defaultdict

def split_comm(data, comm_markup):
    """Partition query rows into commercial and non-commercial lists.

    Args:
        data: iterable of dict rows; each must have a 'query_text' key and
            an 'other' mapping. Rows are modified in place: the resolved
            label is stored under row['other']['toloka_commercial'].
        comm_markup: iterable of dict rows with 'query_text' and
            'image_query_commercial' keys. It is iterated exactly once;
            when a query text occurs several times the last row wins.

    Returns:
        Tuple ``(out_comm, out_not_comm)`` of lists holding the same row
        objects as ``data``, in their original relative order. A row goes
        to ``out_comm`` only when its markup value is exactly
        "COMMERCIAL"; queries missing from the markup, or carrying any
        other value, are labelled "NOT_COMMERCIAL".
    """
    label_by_query = {}
    for markup_row in comm_markup:
        label_by_query[markup_row['query_text']] = markup_row['image_query_commercial']

    out_comm, out_not_comm = [], []
    for elem in data:
        is_commercial = (
            label_by_query.get(elem['query_text'], "NOT_COMMERCIAL") == "COMMERCIAL"
        )
        label = "COMMERCIAL" if is_commercial else "NOT_COMMERCIAL"
        bucket = out_comm if is_commercial else out_not_comm
        elem['other']['toloka_commercial'] = label
        bucket.append(elem)
    return out_comm, out_not_comm


def split_merged_by_comm(job_root, params):
    """Split one merged basket table into commercial / non-commercial tables.

    Reads the '06_..._cleared_dups_1' table for the given combination and
    the shared 'all_parts_commercial_markup' table under ``job_root``,
    partitions the rows with ``split_comm`` and writes the two resulting
    lists to the '07_..._merged_comm' and '07_..._merged_not_comm' tables.

    Args:
        job_root: YT path prefix under which all tables live.
        params: 3-item sequence ``(country, platform, basket_type)``.

    Returns:
        List ``[out_table_comm, out_table_not_comm]`` with the paths of the
        two tables that were written.
    """
    country, platform, basket_type = params

    # One set of substitutions for every path template; str.format ignores
    # keyword arguments that a given template does not reference.
    path_fields = dict(
        job_root=job_root,
        country=country,
        platform=platform,
        basket_type=basket_type,
    )
    in_table = (
        '{job_root}/{country}/{platform}/06_{basket_type}_merged_parts_query_embeds_cleared_dups_1'
    ).format(**path_fields)
    comm_table = '{job_root}/all_parts_commercial_markup'.format(**path_fields)
    out_table_comm = (
        '{job_root}/{country}/{platform}/07_{basket_type}_merged_comm'
    ).format(**path_fields)
    out_table_not_comm = (
        '{job_root}/{country}/{platform}/07_{basket_type}_merged_not_comm'
    ).format(**path_fields)

    # The input rows are materialized because their count is logged below;
    # the markup reader is handed to split_comm as-is.
    data = list(yt.read_table(in_table))
    comm_markup = yt.read_table(comm_table)

    start_msg = 'start ({}, {}, {}), {} queries'.format(
        country, platform, basket_type, len(data)
    )
    # A single pre-formatted argument keeps the output "<timestamp> <msg>".
    print('{} {}'.format(datetime.now(), start_msg))
    out_data_comm, out_data_not_comm = split_comm(data, comm_markup)
    print('{} {}'.format(datetime.now(), 'done'))

    yt.write_table(out_table_comm, out_data_comm)
    yt.write_table(out_table_not_comm, out_data_not_comm)

    return [out_table_comm, out_table_not_comm]


def main(*args):
    """Run the commercial split for every configured basket combination.

    Args:
        *args: exactly six positional values (queries_list, in2, in3,
            token, embed_key, html_file). They are unpacked but not used
            by this function; a different count raises ValueError.

    Returns:
        Flat list of the table paths returned by ``split_merged_by_comm``
        for each (country, platform, basket_type) combination, in
        iteration order.
    """
    # Unpacking enforces the expected argument count.
    queries_list, in2, in3, token, embed_key, html_file = args

    job_root = '//home/images/dev/nerevar/baskets_img/2018Q1_v2'

    countries = ['BY']  # 'RU', 'UA', 'KZ', 'UZ', 'exUSSR'
    platforms = ['desktop']  # 'touch'
    basket_types = ['kpi']  # 'validate'

    tables_list = []
    for combo in itertools.product(countries, platforms, basket_types):
        tables_list.extend(split_merged_by_comm(job_root, combo))
    return tables_list
