# -*- coding: utf-8 -*-

import os
import shutil

from sandbox.sandboxsdk.parameters import LastReleasedResource
from sandbox.sandboxsdk.parameters import ResourceSelector
from sandbox.sandboxsdk.parameters import SandboxBoolParameter
from sandbox.sandboxsdk.parameters import SandboxIntegerParameter
from sandbox.sandboxsdk.paths import copy_path
from sandbox.sandboxsdk.paths import get_logs_folder
from sandbox.sandboxsdk.paths import list_dir
from sandbox.sandboxsdk.paths import make_folder
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.task import SandboxTask

from sandbox.projects import resource_types
from sandbox.projects.common import utils
from sandbox.projects.common.geosearch.indexer_config import IndexerConfig
from sandbox.projects.common.geosearch.indexing import BuildHelper
from sandbox.projects.common.geosearch.utils import prepare_geo_user_factors, unpack_files, unpack_file
from sandbox.projects.common.nanny.nanny import ReleaseToNannyTask
from sandbox.projects.BuildMapsStaticFactorsDownloader import MapsStaticFactors
from sandbox.projects.geobase.Geodata4BinStable.resource import GEODATA4BIN_STABLE

# Values used for the ``group`` attribute of the parameter classes below:
# indexer switches go to the first group, input resources to the second.
INDEXING_OPTIONS_GROUP = 'Indexing Options'
DATA_SOURCES_GROUP = 'Data Sources'

# Region ids that are joined with commas and passed as the
# ``excluded_regions`` setting of the old Wiki source in the indexer config.
OLD_WIKI_EXCLUDE_REGIONS = [
    225,  # Russia
    187,  # Ukraine
    149,  # Belarus
    159,  # Kazakhstan
    169,  # Georgia
    167,  # Azerbaijan
    171,  # Uzbekistan
    168,  # Armenia
    170,  # Turkmenistan
    29386,  # Abkhazia
    207,  # Kyrgyz Republic
    209,  # Tajikistan
    208,  # Moldova
    124,  # France
]

# (region, id_prefix) pairs. For every pair the task looks for
# ``poisrc_<region>.xml.gz`` inside the POISRC resource; files that exist are
# added to the indexer config as a 'poi' source with the given ``id_prefix``,
# files that are missing are silently skipped.
POISRC_FILES = [
    ('cis1', 'maps:'),
    ('cis2', 'maps:'),
    ('eu1', 'maps:'),
    ('aao', 'maps:'),
    ('tr', 'maps:'),
    ('eu2', 'poisrc:eu2:'),
    ('na', 'poisrc:na:'),
    ('saa', 'poisrc:saa:'),
    # deprecated
    ('russia', 'maps:'),
    ('and_mpro', 'poisrc:and_mpro:'),
    ('turkey_mpro', 'poisrc:turkey_mpro:'),
]


class IndexerParameter(LastReleasedResource):
    """Required resource parameter holding the indexer binary.

    The task syncs this resource and runs it with the generated config file.
    """
    name = 'indexer'
    group = INDEXING_OPTIONS_GROUP
    resource_type = resource_types.GEOPOIINDEXER_EXECUTABLE
    required = True
    description = 'Indexer executable'


class NoGeminiParameter(SandboxBoolParameter):
    """Boolean switch forwarded to the indexer config as ``no_gemini``; off by default."""
    name = 'no_gemini'
    group = INDEXING_OPTIONS_GROUP
    default_value = False
    description = 'Do not use Gemini'


class NoTextArchiveParameter(SandboxBoolParameter):
    """Boolean switch forwarded to the indexer config as ``no_text_archive``; off by default."""
    name = 'no_text_archive'
    group = INDEXING_OPTIONS_GROUP
    default_value = False
    description = 'Do not create text archive (indexarc, indexdir)'


class EnrichWithWikiNamesParameter(SandboxBoolParameter):
    """Boolean switch forwarded to the indexer config as ``enrich_with_wiki_names``; off by default."""
    name = 'enrich_with_wiki_names'
    group = INDEXING_OPTIONS_GROUP
    default_value = False
    description = 'Enrich POIs with names from Wiki'


class TakeOnlyStopsParameter(SandboxBoolParameter):
    """Boolean switch; when set, every POISRC source gets ``id_sources = 'mtr'``. Off by default."""
    name = 'take_only_stops'
    group = INDEXING_OPTIONS_GROUP
    default_value = False
    description = 'Take only stops and other mass transit objects from POISRC'


class PoisrcSourceParameter(ResourceSelector):
    """Resource with the POISRC directory.

    The task scans it for ``poisrc_<region>.xml.gz`` files listed in ``POISRC_FILES``.
    """
    name = 'poisrc_source'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_DATABASE_POI_SOURCE_POISRC
    description = 'POISRC directory with gzipped files (Russia)'


class WikiSourceParameter(ResourceSelector):
    """Resource with the old Wiki export archive, which the task extracts with ``tar`` into ``wiki``."""
    name = 'wiki_source'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_DATABASE_POI_SOURCE_WIKI_ARCHIVE
    description = 'Old Wiki export.tar.gz'


class AuxUrlsParameter(ResourceSelector):
    """Resource registered in the indexer config as the ``aux_urls`` file."""
    name = 'aux_urls'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_DATABASE_POI_SOURCE_URLS
    description = 'Auxiliary URLs'


class ExtractUrlsParameter(ResourceSelector):
    """Optional extracturls binary.

    When a resource is selected, the task runs it after indexing to produce ``urls.gz``.
    """
    name = 'extracturls_resource_id'
    group = INDEXING_OPTIONS_GROUP
    resource_type = resource_types.MAPS_RELEVANT_URLS_EXTRACTOR_EXECUTABLE
    description = 'Extracturls executable: '


class GeobaseSnapshotParameter(LastReleasedResource):
    """Geobase snapshot archive.

    The task extracts it into ``geobase`` and hands ``geodata4.bin`` to the indexer.
    """
    name = 'geobase_snapshot'
    group = DATA_SOURCES_GROUP
    resource_type = GEODATA4BIN_STABLE
    description = 'Geobase 4 snapshot'


class GeoUserFactorsParameter(ResourceSelector):
    """Resource passed to ``prepare_geo_user_factors``.

    The resulting directory supplies the static and dynamic user factor files.
    """
    name = 'geo_user_factors'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_GEO_USER_FACTORS
    description = 'Geo user factors'


class GeoEventsDistanceFactorsParameter(ResourceSelector):
    """Resource from which the task takes ``static_events_distance_factors_wiki``."""
    name = 'geo_events_distance_factors'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_STATIC_EVENTS_DISTANCE_FACTORS
    description = 'Geo events distance factors'


class WebUserFactorsParameter(ResourceSelector):
    """Resource registered in the indexer config as the ``dynamic_user_url_factors`` directory."""
    name = 'web_user_factors'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_WEB_USER_FACTORS
    description = 'Web user factors'


class WebIndexAnnParameter(ResourceSelector):
    """Resource whose entries are copied into the index directory before the indexer runs."""
    name = 'web_indexann'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_WEB_INDEXANN
    description = 'Web indexann'


class GeoStatParameter(ResourceSelector):
    """Resource copied into the index directory under the name ``geo.stat``."""
    name = 'geo_stat'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.GEO_STAT
    description = 'geo.stat file'


class QueryRecDataParameter(ResourceSelector):
    """Archived resource that the task unpacks and registers as the ``queryrec`` directory."""
    name = 'queryrec_data'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_QUERYREC_DATA
    description = 'Data for query recognizer'


class FactorAnnParameter(ResourceSelector):
    """Resource directory that the task expects to hold exactly one ``.idx``/``.dat`` pair."""
    name = 'factor_annotations'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_INDEXANN_HOST_DATA
    description = 'File with factor annotations'


class FilteringAnnParameter(ResourceSelector):
    """Resource registered in the indexer config as the ``filtration_annotations`` file."""
    name = 'filtration_annotations'
    group = DATA_SOURCES_GROUP
    resource_type = resource_types.MAPS_ANNOTATIONS_TSV
    description = 'File with filtration annotations'


class FactorAnnThreadsParameter(SandboxIntegerParameter):
    """Integer forwarded to the indexer config as ``factor_annotations_threads``; defaults to 6."""
    name = 'factor_annotations_indexer_threads'
    default_value = 6
    group = INDEXING_OPTIONS_GROUP
    description = 'Number of worker threads for factor annotation indexer'


class StaticFactorsMapParameter(ResourceSelector):
    """Resource registered in the indexer config as the ``static_factors_yt`` file."""
    name = 'static_factors_map'
    group = DATA_SOURCES_GROUP
    resource_type = MapsStaticFactors
    description = 'File static factors map from yt'


class BuildMapsDatabasePoi(ReleaseToNannyTask, SandboxTask):
    """
        Build geosearch POI index
    """
    type = 'BUILD_MAPS_DATABASE_POI'

    input_parameters = (
        IndexerParameter, NoGeminiParameter, NoTextArchiveParameter, EnrichWithWikiNamesParameter, TakeOnlyStopsParameter,
        PoisrcSourceParameter, WikiSourceParameter, AuxUrlsParameter,
        GeobaseSnapshotParameter, GeoUserFactorsParameter, GeoEventsDistanceFactorsParameter, WebUserFactorsParameter, WebIndexAnnParameter,
        GeoStatParameter, ExtractUrlsParameter,
        QueryRecDataParameter, FactorAnnParameter, StaticFactorsMapParameter, FactorAnnThreadsParameter, FilteringAnnParameter
    )

    def _get_unpacked_source(self, parameter, dir):
        """Sync the resource selected by ``parameter`` and unpack it.

        :param parameter: parameter class whose context value is a resource id
        :param dir: directory name, relative to ``self.abs_path()``, to unpack into
            (the name shadows the builtin but is kept for keyword callers)
        :return: the value ``make_folder`` returned for the unpack directory
        """
        arch_path = self.sync_resource(utils.get_or_default(self.ctx, parameter))
        unpack_dir = make_folder(os.path.join(self.abs_path(), dir))
        # First unpack the archive itself, then anything packed inside it.
        unpack_file(arch_path, unpack_dir)
        unpack_files(unpack_dir)
        return unpack_dir

    def _extract_tar_archive(self, archive_path, target_dir, log_prefix):
        """Create ``target_dir`` and extract ``archive_path`` into it with ``tar``.

        :param archive_path: path of the tar archive to extract
        :param target_dir: directory to create and extract into
        :param log_prefix: prefix handed to ``run_process`` for the process logs
        """
        make_folder(target_dir)
        run_process(
            ['tar', '--extract', '--file', archive_path, '--directory', target_dir, '--verbose'],
            log_prefix=log_prefix)

    @staticmethod
    def _factor_annotations_prefix(files):
        """Return the shared path prefix of an ``.idx``/``.dat`` file pair.

        :param files: list of paths found in the factor annotations resource
        :return: path of the pair without its 4-character extension
        :raises AssertionError: if there are not exactly two files or they are
            not ``<prefix>.idx`` and ``<prefix>.dat``
        """
        # Raised explicitly instead of via ``assert`` so that the validation
        # still runs under ``python -O``; type and messages are unchanged.
        if len(files) != 2:
            raise AssertionError('expected resourse with 2 files')
        # Both accepted extensions ('.idx', '.dat') are 4 characters long.
        prefix = files[0][:-4]
        if prefix + '.idx' not in files or prefix + '.dat' not in files:
            raise AssertionError('expected .idx and .dat files with same name')
        return prefix

    def on_execute(self):
        """Assemble the indexer config from the task parameters, run the indexer and publish the index.

        Every optional input is handled only when its parameter holds a
        resource id. The generated ``config.xml`` is also copied to the logs
        folder. After the indexer finishes, the optional URL extraction step
        runs and the ``index`` directory is published as ``MAPS_DATABASE_POI``.
        """
        # create empty index directory
        index_dir = 'index'
        make_folder(index_dir)

        # copy web indexann
        resource_id = utils.get_or_default(self.ctx, WebIndexAnnParameter)
        if resource_id:
            web_indexann_dir = self.sync_resource(resource_id)
            for entry_path in list_dir(web_indexann_dir, abs_path=True):
                copy_path(entry_path, index_dir)

        # make config
        config = IndexerConfig()
        config.options['threads'] = 6
        config.options['no_gemini'] = utils.get_or_default(self.ctx, NoGeminiParameter)
        config.options['no_text_archive'] = utils.get_or_default(self.ctx, NoTextArchiveParameter)
        config.options['enrich_with_wiki_names'] = utils.get_or_default(self.ctx, EnrichWithWikiNamesParameter)
        config.directories['index'] = index_dir

        # prepare data sources:

        # * POISRC
        resource_id = utils.get_or_default(self.ctx, PoisrcSourceParameter)
        if resource_id:
            poisrc_dir = self.sync_resource(resource_id)
            # The flag does not depend on the region, so read it once.
            take_only_stops = utils.get_or_default(self.ctx, TakeOnlyStopsParameter)
            for region, prefix in POISRC_FILES:
                xml = os.path.join(poisrc_dir, 'poisrc_{}.xml.gz'.format(region))
                if not os.path.isfile(xml):
                    # Regions missing from the resource are skipped.
                    continue
                opts = {
                    'type': 'poi',
                    'files': xml,
                    'id_prefix': prefix,
                }
                if take_only_stops:
                    opts['id_sources'] = 'mtr'

                config.sources.append(opts)

        # * old wiki export (in tar.gz)
        resource_id = utils.get_or_default(self.ctx, WikiSourceParameter)
        if resource_id:
            export_tar_gz = self.sync_resource(resource_id)
            wiki_dir = 'wiki'
            self._extract_tar_archive(export_tar_gz, wiki_dir, 'unpack_export')

            config.sources.append({
                'type': 'wiki',
                'files': os.path.join(wiki_dir, 'search.objects*.xml'),
                'excluded_regions': ','.join(str(region) for region in OLD_WIKI_EXCLUDE_REGIONS)
            })
            config.files['wiki_categories'] = os.path.join(wiki_dir, 'categories.xml')

        # * geobase
        resource_id = utils.get_or_default(self.ctx, GeobaseSnapshotParameter)
        if resource_id:
            geobase_snapshot_tar_gz = self.sync_resource(resource_id)
            geobase_dir = 'geobase'
            self._extract_tar_archive(geobase_snapshot_tar_gz, geobase_dir, 'unpack_geobase')
            config.files['geobase_geodata'] = os.path.join(geobase_dir, 'geodata4.bin')

        # * geo user factors
        resource_id = utils.get_or_default(self.ctx, GeoUserFactorsParameter)
        if resource_id:
            geo_user_factors_dir = prepare_geo_user_factors(self, resource_id, 'wiki')
            config.files['static_user_factors'] = os.path.join(geo_user_factors_dir, 'wiki_clicks_static')
            config.files['dynamic_user_factors'] = os.path.join(geo_user_factors_dir, 'wiki_clicks_dynamic')

        # * events distance factors
        resource_id = utils.get_or_default(self.ctx, GeoEventsDistanceFactorsParameter)
        if resource_id:
            events_factors_dir = self.sync_resource(resource_id)
            # NOTE(review): the config key is spelled 'disance'. It is left
            # as is because it has to match the name the indexer config
            # expects, which is not visible here; confirm before renaming.
            config.files['static_events_disance_factors'] = os.path.join(
                events_factors_dir, 'static_events_distance_factors_wiki')

        # * web user factors
        resource_id = utils.get_or_default(self.ctx, WebUserFactorsParameter)
        if resource_id:
            web_user_factors_dir = self.sync_resource(resource_id)
            config.directories['dynamic_user_url_factors'] = web_user_factors_dir

        # * geo.stat
        resource_id = utils.get_or_default(self.ctx, GeoStatParameter)
        if resource_id:
            geo_stat = self.sync_resource(resource_id)
            copy_path(geo_stat, os.path.join(index_dir, 'geo.stat'))

        # aux urls
        resource_id = utils.get_or_default(self.ctx, AuxUrlsParameter)
        if resource_id:
            aux_urls = self.sync_resource(resource_id)
            config.files['aux_urls'] = aux_urls

        # query rec data
        resource_id = utils.get_or_default(self.ctx, QueryRecDataParameter)
        if resource_id:
            config.directories['queryrec'] = self._get_unpacked_source(
                QueryRecDataParameter, 'unpacked_queryrec')

        # * indexfactorann
        resource_id = utils.get_or_default(self.ctx, FactorAnnParameter)
        if resource_id:
            annotations_dir = self.sync_resource(resource_id)
            annotation_files = [
                os.path.join(annotations_dir, file_name)
                for file_name in os.listdir(annotations_dir)
            ]
            # The config receives the pair's common prefix, not a file path.
            config.files['factor_annotations'] = self._factor_annotations_prefix(annotation_files)

        resource_id = utils.get_or_default(self.ctx, FilteringAnnParameter)
        if resource_id:
            config.files['filtration_annotations'] = self.sync_resource(resource_id)

        config.options['factor_annotations_threads'] = utils.get_or_default(
            self.ctx, FactorAnnThreadsParameter)

        # * static_factors.mms
        resource_id = utils.get_or_default(self.ctx, StaticFactorsMapParameter)
        if resource_id:
            config.files['static_factors_yt'] = self.sync_resource(resource_id)

        # fetch indexer executable
        indexer = self.sync_resource(utils.get_or_default(self.ctx, IndexerParameter))

        # save config (a copy goes to the logs folder)
        config_file = 'config.xml'
        config_data = config.dump()
        with open(config_file, 'w') as fd:
            fd.write(config_data)
        shutil.copy(config_file, get_logs_folder())

        # run!
        run_process([indexer, config_file], log_prefix='indexer')

        self.gen_urls_file(index_dir)

        # publish index without tar
        self.create_resource(self.descr, index_dir, resource_types.MAPS_DATABASE_POI)

    def gen_urls_file(self, index_dir):
        """Run the optional extracturls binary and register its output.

        Does nothing when no extracturls resource is selected. Otherwise the
        binary is run on ``poi_storage.mms`` and ``canonizer.zz`` from
        ``index_dir``, writing ``urls.gz``, which is then registered through
        ``BuildHelper.create_index_resource`` as ``MAPS_WEB_URLS_POI``.

        :param index_dir: directory with the freshly built index
        """
        extracturls_id = utils.get_or_default(self.ctx, ExtractUrlsParameter)
        if not extracturls_id:
            return

        extracturls = self.sync_resource(extracturls_id)
        output_file = 'urls.gz'
        run_process([
            extracturls,
            os.path.join(index_dir, 'poi_storage.mms'),
            os.path.join(index_dir, 'canonizer.zz'),
            output_file
        ], log_prefix='extracturls')
        BuildHelper.create_index_resource(self,
                                          primary_source_resource_id=self.ctx.get(PoisrcSourceParameter.name, None),
                                          index_directory=output_file,
                                          index_type=resource_types.MAPS_WEB_URLS_POI)


# Module-level alias of the task class.
# NOTE(review): presumably the name the Sandbox task loader looks up - confirm.
__Task__ = BuildMapsDatabasePoi
