import json
import os

from sandbox import sdk2
from sandbox.projects.common import file_utils
from sandbox.projects.suggest.dicts import SuggestDictTask
from sandbox.sandboxsdk.environments import PipEnvironment


def normalize(query):
    """Return *query* reduced to lower-case letters separated by single spaces.

    The input may be either a UTF-8 encoded byte string or a unicode string.
    Every character that is neither alphabetic nor whitespace (digits,
    punctuation, ...) is dropped, runs of whitespace are collapsed into one
    space and leading/trailing whitespace is removed.

    The result is always returned as a UTF-8 encoded byte string.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this check behaves
    # exactly like the former ``isinstance(query, str)`` there, while on
    # Python 3 it correctly decodes only real byte strings.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # Filter in a single pass instead of growing a string char by char.
    kept = ''.join(ch for ch in query.lower() if ch.isalpha() or ch.isspace())

    # split() without arguments both trims and collapses whitespace runs.
    return ' '.join(kept.split()).encode('utf-8')


class BuildAliceSkillsDict(sdk2.Task, SuggestDictTask):
    """ Build Alice skills dictionary

    Reads rows from a YT table, turns them into the ``queries``, ``groups``,
    ``streams`` and ``data`` input files in the current working directory,
    runs the data builder on them and publishes the resulting ``dict``
    directory as the "alice_skills" dictionary.

    NOTE(review): ``setup_yt_client``, ``yt_client``, ``yt_format``,
    ``run_data_builder`` and ``publish_dict`` are not defined in this class;
    presumably they come from ``SuggestDictTask`` - confirm there.
    """

    class Parameters(sdk2.Task.Parameters):
        # Vault record the YT token is read from. The parameter is registered
        # under the explicit name "vault_token_name", which differs from the
        # attribute name used in this class.
        yt_token_vault_name = sdk2.parameters.String(
            "Name of the vault record with YT token",
            name="vault_token_name",
            default="yt_token"
        )

        # YT path of the table the rows are read from.
        input_table_path = sdk2.parameters.String(
            "Input table path",
            default="//home/paskills/suggests/stable"
        )

    class Requirements(sdk2.Requirements):
        environments = [
            PipEnvironment("yandex-yt")
        ]

    @staticmethod
    def _collect_entries(rows):
        """Convert table rows into query lines and rich-data lines.

        Each row is expected to provide the keys ``weight``, ``name``,
        ``data`` and ``keywords``.

        Returns a ``(queries, rich_data)`` pair of lists of lines:

        * ``queries`` holds ``<query>\\t\\t<weight>\\n`` for a normalized
          name and, when not seen before,
          ``<name + keywords>\\t<query>\\t<weight>\\n`` for the normalized
          name combined with its keywords;
        * ``rich_data`` holds ``<query>\\t<json of row['data']>\\n``.

        Only the first row producing a given normalized name is used; later
        rows with the same normalized name are skipped entirely.
        """
        queries = []
        rich_data = []
        seen = set()

        for row in rows:
            # Weights are capped at 10.0.
            weight = min(10.0, float(row['weight']))

            query = normalize(row['name'])
            meta_data = json.dumps(row['data'], ensure_ascii=False).encode('utf-8')

            # NOTE(review): json.dumps never returns an empty string, so the
            # ``meta_data`` part of this check does not filter anything;
            # possibly ``row['data']`` was meant - confirm before changing.
            if query in seen or not meta_data:
                continue

            weight_text = str(weight)

            seen.add(query)
            queries.append(query + '\t\t' + weight_text + '\n')
            rich_data.append(query + '\t' + meta_data + '\n')

            # Additional entry: name plus keywords, pointing at the plain query.
            query_with_keywords = normalize(row['name'] + ' ' + row['keywords'])
            if query_with_keywords in seen:
                continue

            seen.add(query_with_keywords)
            queries.append(query_with_keywords + '\t' + query + '\t' + weight_text + '\n')

        return queries, rich_data

    def on_execute(self):
        """Read the input table, build the dictionary and publish it."""
        proxy = "hahn.yt.yandex.net"
        token = sdk2.Vault.data(self.Parameters.yt_token_vault_name)
        self.setup_yt_client(proxy, token)

        rows = self.yt_client.read_table(self.Parameters.input_table_path, format=self.yt_format)
        queries, rich_data = self._collect_entries(rows)

        work_dir = os.getcwd()

        # The query lines are written in sorted order; sorting once is enough.
        queries_path = os.path.join(work_dir, 'queries')
        file_utils.write_lines(queries_path, sorted(queries))

        # The groups file gets a single empty line.
        groups_path = os.path.join(work_dir, 'groups')
        file_utils.write_lines(groups_path, ['\n'])

        # A single stream named "ALL" with the value 10.0.
        streams_path = os.path.join(work_dir, 'streams')
        file_utils.write_lines(streams_path, ['ALL\t{}\n'.format(10.0)])

        # Rich data keeps the order in which rows were read.
        data_path = os.path.join(work_dir, 'data')
        file_utils.write_lines(data_path, rich_data)

        dict_path = os.path.join(work_dir, "dict")
        os.makedirs(dict_path)
        dict_prefix = os.path.join(dict_path, "alice")

        self.run_data_builder(dict_prefix,
                              queries_path,
                              groups_path,
                              streams_path,
                              data_path,
                              word_index=True,
                              top_size=100)

        self.publish_dict("alice_skills",
                          "Dictionaries for Alice skills",
                          dict_path,
                          autodeploy=True)
