# -*- coding: utf-8 -*
import os
import datetime
import sys

from sandbox import sdk2

from sandbox.sandboxsdk import environments
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.paths import get_logs_folder

from sandbox.projects import resource_types
from sandbox.projects.geosearch import resource_types as geo_types
from sandbox.projects.geobase.Geodata5BinStable import resource as gbr
from sandbox.projects.geosearch.tools import load_index

from sandbox.projects.mapsearch.BuildMapsDatabaseAdvert2 import MapsDatabaseAdvert2
from sandbox.projects.geosearch.tools.database_notifications import NotifyGeosearchDuty


class CalcSimilarOrgs(NotifyGeosearchDuty, sdk2.Task):
    """
        Calculates geosearch similar orgs from hypotheses

        The task downloads a sharded index, runs the similar orgs calcer
        executable over it, uploads the calcer's tab-separated stdout to a
        YT table, optionally runs the merger against a custom table, sorts
        the table by permalink and publishes a SIMILAR_ORGS_TABLE resource
        whose file contains the output table path.
    """

    class Requirements(sdk2.Task.Requirements):
        cores = 24
        ram = 80000
        disk_space = 250 * 1024

        environments = (
            environments.PipEnvironment('yandex-yt'),
            environments.PipEnvironment('yandex-yt-yson-bindings-skynet'),
        )

        class Caches(sdk2.Requirements.Caches):
            pass

    class Parameters(sdk2.task.Parameters):
        notify_parameters = NotifyGeosearchDuty.Parameters()
        similar_orgs_calcer = sdk2.parameters.Resource('Similar orgs calcer: ',
                                                       resource_type=resource_types.CALC_SIMILAR_ORGS_EXECUTABLE,
                                                       required=True)
        # Hypotheses source: either a YT table (hypotheses_table resource or
        # hypotheses_table_name) or the similar_orgs_hypotheses resource,
        # which is fed to the calcer's stdin. The table takes precedence.
        similar_orgs_hypotheses = sdk2.parameters.Resource('Similar orgs hypotheses: ',
                                                           resource_type=resource_types.SIMILAR_ORGS_HYPOTHESES)
        hypotheses_table = sdk2.parameters.Resource('Hypotheses table resource: ',
                                                    resource_type=resource_types.SIMILAR_ORGS_HYPOTHESES_TABLE)
        hypotheses_table_name = sdk2.parameters.String('Hypotheses table exact name: ', default_value='')
        similar_orgs_model = sdk2.parameters.Resource('Similar orgs model: ',
                                                      resource_type=resource_types.SIMILAR_ORGS_MODEL)
        index_shardmap = sdk2.parameters.Resource('Geobasesearch index shardmap',
                                                  resource_type=resource_types.ADDRS_BUSINESS_SHARDMAP)
        main_rubrics = sdk2.parameters.Resource('Business main rubrics',
                                                resource_type=resource_types.ORGS_MAIN_RUBRICS_BIN)
        max_similar_orgs = sdk2.parameters.Integer('Max similar orgs: ',
                                                   default_value=20,
                                                   required=True)
        polynom_name = sdk2.parameters.String('Polynom name: ',
                                              default_value='')
        threads_num = sdk2.parameters.Integer('Threads num: ')
        end_date = sdk2.parameters.String('User factors date: ',
                                          default_value='')
        geobase = sdk2.parameters.Resource('Geobase: ',
                                           resource_type=gbr.GEODATA5BIN_STABLE,
                                           required=True)
        shuffle_orgs = sdk2.parameters.Bool('Take random top from hypotheses',
                                            default_value=False,
                                            required=True)
        insane_random = sdk2.parameters.Bool('Take random similars from nearby orgs',
                                             default_value=False,
                                             required=True)
        preparat_window = sdk2.parameters.Integer('Preparat window: ',
                                                  default_value=0,
                                                  required=True)
        # The default used to be the boolean True, which is truthy and made
        # '--preparat-path' always be passed with a bogus value; an empty
        # string means "no custom path", like the other optional strings.
        preparat_path = sdk2.parameters.String('Custom preparat path: ',
                                               default_value='')
        exp_label = sdk2.parameters.String('Experiment label: ',
                                           default_value='')
        offline_mse_model = sdk2.parameters.Resource('Offline MSE model: ',
                                                     resource_type=resource_types.SIMILAR_ORGS_MODEL)
        offline_class_model = sdk2.parameters.Resource('Offline classification model: ',
                                                       resource_type=resource_types.SIMILAR_ORGS_MODEL)
        wizard_boost = sdk2.parameters.Integer('Boosted orgs top size: ',
                                               default_value=0)
        advert_data = sdk2.parameters.Resource('Advert data for geoproduct boost: ',
                                               resource_type=MapsDatabaseAdvert2)
        advert_only = sdk2.parameters.Bool('Similar orgs from geoproduct only',
                                           default_value=False)
        top_adverts = sdk2.parameters.Integer('Move this count of adverted orgs to the beginning of list',
                                              default_value=0)
        dump_factors = sdk2.parameters.Bool('Dump factor values for each permalink',
                                            default_value=False)
        schematized_output = sdk2.parameters.String('Table path for schematized output: ',
                                                    default_value='')
        freeroom_yt_path = sdk2.parameters.String('Freeroom YT path: ',
                                                   default_value='')
        availability_stats_yt_path = sdk2.parameters.String('Availability_stats YT path: ',
                                                            default_value='')
        output_yt_path = sdk2.parameters.String('Output YT path: ',
                                                default_value='//home/geosearch/similar_orgs',
                                                required=True)
        ttl_days = sdk2.parameters.Integer('Output YT table TTL in days',
                                           default_value=7)
        output_table_name = sdk2.parameters.String('Output table: ', default_value='')
        similar_orgs_merger = sdk2.parameters.Resource('Similar orgs merger: ',
                                                       resource_type=geo_types.SIMILAR_ORGS_MERGER)
        custom_similar_orgs = sdk2.parameters.String('Custom orgs table: ', default_value='')

    def on_execute(self):
        """
            Build the calcer command line from the task parameters, run the
            calcer and publish its output as a sorted YT table.

            Hypotheses come from a YT table (passed as '--input-table') when
            the 'hypotheses_table' resource or 'hypotheses_table_name' is
            set; otherwise the 'similar_orgs_hypotheses' resource is piped
            to the calcer's stdin.

            Raises:
                Exception: when neither a hypotheses table nor a hypotheses
                    resource is specified.
        """
        task_params = self.Parameters

        calcer = sdk2.ResourceData(task_params.similar_orgs_calcer).path

        index = './index'
        files = ['companies.pbs', 'factors.pbs', 'rubrics.pbs', 'features.pbs', 'address_storage.mms']
        shard_count = load_index.download_sharded_index(task_params.index_shardmap, index, files)

        result_name = './similar_orgs_list.txt'
        factors_file = './factors_dump.txt'

        self.yt_token = sdk2.Vault.data('GEOMETA-SEARCH', 'yt-token')
        # Exported through the environment; presumably consumed by the
        # calcer/merger child processes - verify against those binaries.
        os.environ['YT_TOKEN'] = self.yt_token
        os.environ['YT_PREFIX'] = '//home/geosearch/'

        # ---- calcer command line (argument order kept as-is) ----
        exec_params = [str(calcer)]
        model = task_params.similar_orgs_model
        if model:
            exec_params.extend(['-m', str(sdk2.ResourceData(model).path)])

        polynom = task_params.polynom_name
        if polynom:
            exec_params.extend(['-p', polynom])

        exec_params.extend(['-t', str(task_params.max_similar_orgs)])
        threads = task_params.threads_num
        if threads:
            exec_params.extend(['--threads', str(threads)])

        end_date = task_params.end_date
        preparat_window = task_params.preparat_window
        if preparat_window:
            exec_params.extend(['--preparat-window', str(preparat_window)])

        preparat_path = task_params.preparat_path
        if preparat_path:
            exec_params.extend(['--preparat-path', preparat_path])

        # A preparat window without an explicit date is anchored to today.
        if not end_date and preparat_window:
            end_date = datetime.datetime.now().strftime('%Y%m%d')

        if end_date:
            exec_params.extend(['--target-date', end_date])

        if task_params.shuffle_orgs:
            exec_params.append('--shuffle')

        if task_params.insane_random:
            exec_params.append('--insane-random')

        if task_params.offline_mse_model:
            exec_params.extend(['--offline-model-mse',
                                str(sdk2.ResourceData(task_params.offline_mse_model).path)])

        if task_params.offline_class_model:
            exec_params.extend(['--offline-model-class',
                                str(sdk2.ResourceData(task_params.offline_class_model).path)])

        wizard_boost = task_params.wizard_boost
        if wizard_boost > 0:
            exec_params.extend(['--wizard-boost-size', str(wizard_boost)])

        if task_params.schematized_output:
            exec_params.extend(['--output-yt', task_params.schematized_output])

        if task_params.freeroom_yt_path:
            exec_params.extend(['--hotel-dates-table', task_params.freeroom_yt_path])

        if task_params.availability_stats_yt_path:
            exec_params.extend(['--hotel-availabilities-table', task_params.availability_stats_yt_path])

        geobase = str(sdk2.ResourceData(task_params.geobase).path)
        exec_params.extend(['--geobase', geobase])

        # Resolve the hypotheses table: the resource file holds either an
        # absolute YT path ('//...') or a bare table name that is placed
        # under the from_backa directory (relative to YT_PREFIX).
        input_table = task_params.hypotheses_table
        full_tbl_path = ''
        if input_table:
            with open(str(sdk2.ResourceData(input_table).path)) as table_name_file:
                table_name = table_name_file.read().strip()
            if table_name.startswith('//'):
                full_tbl_path = table_name
            else:
                full_tbl_path = 'similar_orgs/hypotheses/from_backa/{0}'.format(table_name)
        elif task_params.hypotheses_table_name:
            full_tbl_path = task_params.hypotheses_table_name

        if full_tbl_path:
            exec_params.extend(['--input-table', full_tbl_path])

        if task_params.advert_data:
            advert_path = str(sdk2.ResourceData(task_params.advert_data).path) + "/advert.pb.bin"
            # write() instead of the py2-only 'print >>' statement; same output.
            sys.stderr.write("Advert path: " + advert_path + "\n")
            exec_params.extend(['--advert', advert_path])

        if task_params.advert_only:
            exec_params.append('--advert-only')

        top_adverts = task_params.top_adverts
        if top_adverts > 0:
            exec_params.extend(['--top-adverts', str(top_adverts)])

        exec_params.extend(['--shard-count', str(shard_count)])

        exec_params.extend(['--main-rubrics', str(sdk2.ResourceData(task_params.main_rubrics).path)])

        if task_params.dump_factors:
            exec_params.extend(['--dump-factors', factors_file])

        exec_params.append(index)

        # ---- run the calcer; stdout goes to result_name ----
        proc_err = os.path.join(get_logs_folder(), 'process_dump.err')
        with open(result_name, 'w') as output, open(proc_err, 'w') as err_log:
            if full_tbl_path:
                run_process(exec_params, stdout=output, stderr=err_log)
            elif task_params.similar_orgs_hypotheses:
                hypotheses = sdk2.ResourceData(task_params.similar_orgs_hypotheses).path
                # str() for consistency with the other path usages in this task.
                with open(str(hypotheses)) as hypotheses_input:
                    run_process(exec_params, stdout=output, stdin=hypotheses_input, stderr=err_log)
            else:
                raise Exception('Cannot find hypotheses(either table or resource)')

        if task_params.dump_factors:
            factors_resource = sdk2.Resource[resource_types.OTHER_RESOURCE]
            current_factors_resource = factors_resource(self, 'factors dump', factors_file)
            factors_data = sdk2.ResourceData(current_factors_resource)
            factors_data.ready()

        def get_row(line):
            # One calcer output line -> one YT row: first tab-separated
            # column is the permalink, second is the payload. The line
            # terminator is stripped before splitting so that a final line
            # without '\n' (or a line with extra columns) does not lose the
            # last character of the payload.
            items = line.rstrip('\n').split('\t')
            return {
                "permalink": int(items[0]),
                "Data": items[1]}

        output_tbl_name = task_params.output_table_name or os.path.join(
            task_params.output_yt_path, 'similar_orgs_{task_id}'.format(task_id=self.id))

        # ---- upload to YT ----
        # Imported here because yt.wrapper comes from the pip environment
        # declared in Requirements.
        from yt.wrapper import YtClient
        yt_config = {'proxy': {'url': 'hahn.yt.yandex.net'},
                     'token': self.yt_token}
        yt = YtClient(config=yt_config)
        # Drop a table left over from a previous run, if any.
        yt.remove(output_tbl_name, force=True)

        attrs = {}
        if task_params.ttl_days:
            # NOTE(review): datetime.now() is naive local time and
            # isoformat() carries no timezone - confirm how YT interprets
            # 'expiration_time' given in this form.
            ts = datetime.datetime.now()
            ts += datetime.timedelta(days=task_params.ttl_days)
            attrs['expiration_time'] = ts.isoformat()

        yt.create('table', output_tbl_name, attributes=attrs)
        with open(result_name) as result_file:
            yt.write_table(output_tbl_name, (get_row(line) for line in result_file), raw=False)

        # ---- optional merge with a custom table (output table is both
        # the '-m' input and the positional argument) ----
        if task_params.similar_orgs_merger and task_params.custom_similar_orgs:
            merger = sdk2.ResourceData(task_params.similar_orgs_merger).path
            merger_cmd = [str(merger), '-m', output_tbl_name, '-s', task_params.custom_similar_orgs, output_tbl_name]
            merger_err = os.path.join(get_logs_folder(), 'process_dump_merger.err')
            with open(merger_err, 'w') as merger_log:
                run_process(merger_cmd, stderr=merger_log)

        yt.run_sort(output_tbl_name, sort_by=["permalink"])

        # ---- publish a resource whose file contains the table path ----
        resource_path = './similar_orgs_table.txt'
        with open(resource_path, 'w') as f:
            f.write(output_tbl_name)

        yt_resource = sdk2.Resource[geo_types.SIMILAR_ORGS_TABLE]
        current_yt_resource = yt_resource(self, 'similar orgs table', resource_path)
        if task_params.exp_label:
            current_yt_resource.exp = task_params.exp_label
        yt_data = sdk2.ResourceData(current_yt_resource)
        yt_data.ready()
