import six
import logging
import os.path
from math import ceil

from sandbox.common import fs
from sandbox.projects import resource_types
from sandbox.projects.common.arcadia import sdk
from sandbox.projects.common.build.ArcadiaTask import ArcadiaTask
import sandbox.projects.common.constants as consts
from sandbox.projects.querysearch_saas.prepare_saas_querysearch_data import PrepareSaasQuerysearchData
from sandbox.projects.common.utils import get_or_default, amount_of_working_subtasks, wait_subtasks_stop
from sandbox.sandboxsdk import parameters, environments
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.paths import get_logs_folder
from sandbox.sandboxsdk.svn import Arcadia

import sandbox.common.types.client as ctc

from sandbox.projects.Org1SerpDownloader import Org1SerpDownloader as dloader
from sandbox.projects.BuildAddrSnippetData import PrepareSaasQuerydata, UpdateSaasQuerydataProd


# Key in the task context under which the list of per-portion progress records
# is stored; each record is a dict with 'range', 'status' and 'task_id'.
PROGRESS = 'progress'
# URL of the gzip-compressed address snippets export fetched by
# Org1AggrDataBuilder.download_addrsnip_data().
BACKA_ADDRSNIP_EXPORT = "http://export.backa.yandex.ru/address.snippets.merged.csv.gz"


class Org1AggrDataBuilder(ArcadiaTask, PrepareSaasQuerysearchData):
    """Task that builds docid -> JSON data for companies and optionally uploads it to YT.

    Flow implemented by the methods of this class: fetch and sort the address
    snippets export, split it into portions, run one Org1SerpDownloader subtask
    per portion, concatenate the subtasks' PLAIN_TEXT resources, run the built
    Arcadia tools over the result and, when requested via the context, upload
    the records to YT.
    """

    # Used by run_subtasks() as the upper bound on simultaneously working
    # downloader subtasks.
    class ThreadsNum(parameters.SandboxIntegerParameter):
        name = 'threads_num'
        description = 'Number of threads for requesting web search'
        default_value = 1
        required = True

    # Number of lines of the companies file handed to a single downloader subtask.
    class PortionSize(parameters.SandboxIntegerParameter):
        name = 'portion_size'
        description = 'Number of oids to be processed as one portion'
        default_value = 10000
        required = True

    # Caps the number of processed lines; a falsy value (0) disables the cap.
    class OidsLimit(parameters.SandboxIntegerParameter):
        name = 'oids_limit'
        description = 'Maximum number of oids to process (0 = all)'
        default_value = 100
        required = False

    # When set, on_execute() skips the download stage and aggregate_urls()
    # reads resources from the children of the selected task instead.
    class SourceTask(parameters.TaskSelector):
        name = 'get_child_tasks_from'
        description = 'Task to take hamster results from'
        required = False
        task_type = "ORG1_AGGR_DATA_BUILDER"

    # Forwarded unchanged to every downloader subtask.
    class UseShortCompanyName(parameters.SandboxBoolParameter):
        name = 'use_shortname'
        description = 'Use short company name for web request'
        default_value = False

    # Forwarded unchanged to every downloader subtask.
    class UseShortAddress(parameters.SandboxBoolParameter):
        name = 'use_short_addr'
        description = 'Use short company address for web request'
        default_value = False

    # NOTE(review): declared as an input parameter but not read anywhere in this file.
    class UseCachedGeojson(parameters.SandboxBoolParameter):
        name = 'use_cached_geojson'
        description = 'Use geojson from address snippet data (do not request geosearch)'
        default_value = False

    type = "ORG1_AGGR_DATA_BUILDER"
    input_parameters = (ThreadsNum, PortionSize, OidsLimit, SourceTask, PrepareSaasQuerydata, UpdateSaasQuerydataProd,
                        UseCachedGeojson, UseShortCompanyName, UseShortAddress) \
                        + PrepareSaasQuerysearchData.input_parameters
    # presumably expressed in MiB (i.e. 30 GiB) - TODO confirm against the SDK
    required_ram = 30 * 1024
    environment = (
        environments.PipEnvironment('yandex-yt', use_wheel=True),
    )
    client_tags = ctc.Tag.Group.LINUX

    # Local file with the sorted, de-duplicated companies export (see get_companies_tsv).
    ALL_COMP_FILE = 'all_companies'
    # Local file into which aggregate_urls() concatenates the subtasks' outputs.
    OID2URLS_FILE = 'ru_oid2urls.tsv'

    # Directory passed to the build as results_dir; tool binaries are looked up
    # under BINARY_DIR/<arcadia path>/<binary name>.
    BINARY_DIR = './bin'
    # Arcadia-relative paths of the build targets used by this task.
    TOOLS_PATH = 'search/geo/tools/org1_wizard_by_aggregators'
    BUILD_JSON_PATH = TOOLS_PATH + '/build_json_from_snippet_tsv'
    NORMALIZE_URL_PATH = TOOLS_PATH + '/normalize_url'
    URL2DOCID_PATH = 'yweb/querydata/tools/url2docid'
    # NOTE(review): QDATA_INDEXER_PATH is not referenced elsewhere in this file;
    # QDATA_VIEWER_PATH is only passed to do_build().
    QDATA_INDEXER_PATH = 'yweb/querydata/querydata_indexer'
    QDATA_VIEWER_PATH = 'yweb/querydata/querydata_viewer'
    # Resource id synced in prepare_json() and passed as the --aggregators argument.
    BACKA_AGGREGATORS_RESID = 238698085

    # docid -> url mapping filled by save_docid2url_mapping() and read by
    # get_data_for_yt(). NOTE: this is a class-level dict that is mutated in
    # place, so it is shared by all instances living in the same process.
    DOCID2URL = {}

    # Output file names produced by prepare_json().
    DOCID2OID = 'docid2oid.txt'
    DOCID2JSON = 'docid2json.txt'

    # Context key under which the id of the "ALL companies" resource is stored.
    ALL_COMP_RESID = 'all_companies_res_id'

    def run_process(self, *args, **kwargs):
        """Invoke the SDK ``run_process`` with stderr appended to ``stderr.log``.

        All positional and keyword arguments are forwarded unchanged, except
        that any ``stderr`` keyword supplied by the caller is replaced by a
        handle to ``stderr.log`` in the task logs folder.

        :return: whatever the SDK ``run_process`` returns.
        """
        with open(os.path.join(get_logs_folder(), 'stderr.log'), 'a') as stderr_log:
            kwargs.update(stderr=stderr_log)
            # the name below resolves to the module-level SDK function, not this method
            result = run_process(*args, **kwargs)
            return result

    def run_subtasks(self):
        """Start downloader subtasks for unassigned portions, then wait for any subtask.

        At most ``ThreadsNum - amount_of_working_subtasks()`` new subtasks are
        created. The id of each created subtask is written into the
        ``task_id`` field of its progress record. ``wait_subtasks_stop`` is
        called unconditionally at the end.
        """
        free_slots = get_or_default(self.ctx, self.ThreadsNum) - amount_of_working_subtasks()
        if free_slots > 0:
            for number, portion in enumerate(self.ctx[PROGRESS], 1):
                # a non-zero task_id means the portion already has a subtask
                if portion['task_id'] != 0:
                    continue

                subtask_params = {
                    dloader.CompaniesResourceId.name: self.ctx[self.ALL_COMP_RESID],
                    dloader.RangeStart.name: portion['range'][0],
                    dloader.RangeEnd.name: portion['range'][1],
                    dloader.ToolsPath.name: self.TOOLS_PATH,
                    dloader.UseShortCompanyName.name: get_or_default(self.ctx, self.UseShortCompanyName),
                    dloader.UseShortAddress.name: get_or_default(self.ctx, self.UseShortAddress),
                    'notify_if_finished': '',
                }
                subtask = self.create_subtask(
                    task_type=dloader.type,
                    description="downloader for portion #{}/{}".format(number, len(self.ctx[PROGRESS])),
                    input_parameters=subtask_params,
                )
                portion['task_id'] = subtask.id

                free_slots -= 1
                if free_slots == 0:
                    break

        wait_subtasks_stop(wait_all=False)

    def aggregate_urls(self, out_file):
        """Concatenate the PLAIN_TEXT outputs of the downloader subtasks into ``out_file``.

        The tasks to read from are this task's own subtasks, or - when the
        ``SourceTask`` parameter is set - the children of that task. For every
        such task the first READY ``PLAIN_TEXT`` resource is synced and its
        content appended to ``out_file``; tasks without such a resource are
        skipped.

        :param out_file: path of the file to (over)write.
        """
        child_ids = self.list_subtasks(load=False)
        source_task = get_or_default(self.ctx, self.SourceTask)
        if source_task:
            logging.info("Will get child tasks' resources from {}".format(source_task))
            child_ids = [child.id for child in channel.sandbox.list_tasks(parent_id=source_task)]

        with open(out_file, 'w') as merged:
            for child_id in child_ids:
                ready_resources = channel.sandbox.list_resources(
                    task_id=child_id,
                    resource_type=resource_types.PLAIN_TEXT,
                    status="READY",
                )
                if not ready_resources:
                    continue
                local_path = self.sync_resource(ready_resources[0])
                with open(local_path) as part:
                    merged.writelines(part)

    def download_addrsnip_data(self):
        """Fetch the address snippets export and unpack it into ``addrsnip.csv``.

        The archive is downloaded from ``BACKA_ADDRSNIP_EXPORT`` to
        ``addrsnip.csv.gz`` in the current directory and decompressed in place.
        """
        fs.fetch_file_via_http(BACKA_ADDRSNIP_EXPORT, "addrsnip.csv.gz")
        # -f: this method is called from both get_companies_tsv() and
        # prepare_json(); without it gzip refuses to overwrite an addrsnip.csv
        # left over from the earlier call and exits with an error.
        self.run_process('gzip -df addrsnip.csv.gz')

    def update_progress(self):
        """Refresh progress records from the current statuses of their subtasks.

        Only records that have a subtask assigned and are not yet marked done
        are examined. A subtask whose status is in ``Status.Group.SUCCEED``
        marks the record done; one in ``Status.Group.SCHEDULER_FAILURE`` gets
        its ``task_id`` reset to 0 so that the portion is picked up again. Any
        other status leaves the record untouched.
        """
        for portion in self.ctx[PROGRESS]:
            if not portion['task_id'] or portion['status']:
                continue

            task_status = channel.sandbox.get_task(portion['task_id']).status
            if task_status in self.Status.Group.SUCCEED:
                portion['status'] = True
                continue
            if task_status in self.Status.Group.SCHEDULER_FAILURE:
                portion['task_id'] = 0

    def has_work_for_subtasks(self):
        """Return True if at least one progress record has no subtask assigned (``task_id == 0``)."""
        for portion in self.ctx[PROGRESS]:
            if portion['task_id'] == 0:
                return True
        return False

    def get_urls_from_hamster(self):
        """Drive the downloader subtasks until every portion has been handled.

        Progress records are refreshed first. If some portion still has no
        subtask, new subtasks are started via ``run_subtasks()``, which then
        waits for any subtask to stop. Otherwise, if subtasks are still
        working, the task waits for all of them.
        """
        self.update_progress()
        if self.has_work_for_subtasks():
            self.run_subtasks()
        elif amount_of_working_subtasks() > 0:
            # Every portion has a subtask, but with more than one concurrent
            # subtask some of them may still be running after the first one
            # stopped. Returning now would let the caller aggregate partial
            # results, so wait until all of them stop.
            wait_subtasks_stop(wait_all=True)

    def get_companies_tsv(self, out_file):
        """Download the address snippets export and write its sorted form to ``out_file``.

        ``addrsnip.csv`` is sorted by ``sort`` in the C locale with a tab as
        the field separator, ``-k23`` as the key and ``-u`` to keep one line
        per key. The stderr of the sort is appended to ``sort.err`` in the
        task logs folder.

        :param out_file: path of the file receiving the sorted output.
        """
        self.download_addrsnip_data()

        sort_command = '/usr/bin/env bash -c "LC_ALL=C sort -t $\'\\t\' -k23 -u addrsnip.csv"'
        with open(out_file, 'w') as sorted_out:
            sort_err_path = os.path.join(get_logs_folder(), 'sort.err')
            with open(sort_err_path, 'a') as sort_err:
                # calls the SDK function directly because stderr goes to its own log file here
                run_process(sort_command, stdout=sorted_out, stderr=sort_err, shell=True)

    def save_docid2url_mapping(self, urls_file, docids_file):
        """ order is expected to be the same in both files """
        with open(urls_file) as fin_urls, open(docids_file) as fin_docids:
            while True:
                l_url = fin_urls.readline().strip()
                l_docid = fin_docids.readline().strip()
                if not l_url and not l_docid:
                    break
                elif bool(l_url) != bool(l_docid):
                    raise Exception("Url and docid files have different length")
                self.DOCID2URL[l_docid.split('\t')[1]] = l_url.split('\t')[1]

    def prepare_json(self):
        """Turn the aggregated oid -> urls file into the docid -> JSON file.

        Steps, each running a binary from ``BINARY_DIR``:

        1. ``org1_wizard_by_aggregators --oids OID2URLS_FILE --aggregators <resource>``
           writes ``oid2urls_preflt.tsv``.
        2. ``normalize_url --field-number 1`` reads it on stdin and writes
           ``oid2urls.norm.txt``.
        3. ``url2docid -F1`` reads that on stdin and writes ``docid2urls.txt``,
           which is also published as a ``PLAIN_TEXT`` resource.
        4. The docid -> url mapping is loaded via ``save_docid2url_mapping()``.
        5. ``org1_wizard_by_aggregators`` (no arguments) reads ``docid2urls.txt``
           on stdin and writes ``DOCID2OID``.
        6. The address snippets export is downloaded again and
           ``build_json_from_snippet_tsv`` writes ``DOCID2JSON``.

        All commands are passed as argument lists and run without a shell:
        with ``shell=True`` and a list, ``subprocess`` hands every item after
        the first to the shell rather than to the program.
        """
        logging.info("Preparing json")

        wizard_bin = os.path.join(self.BINARY_DIR, self.TOOLS_PATH, 'org1_wizard_by_aggregators')
        normalize_bin = os.path.join(self.BINARY_DIR, self.NORMALIZE_URL_PATH, 'normalize_url')
        url2docid_bin = os.path.join(self.BINARY_DIR, self.URL2DOCID_PATH, 'url2docid')
        build_json_bin = os.path.join(self.BINARY_DIR, self.BUILD_JSON_PATH, 'build_json_from_snippet_tsv')

        prefiltered = 'oid2urls_preflt.tsv'
        with open(prefiltered, 'w') as fout:
            self.run_process([
                wizard_bin,
                '--oids', self.OID2URLS_FILE,
                '--aggregators', self.sync_resource(self.BACKA_AGGREGATORS_RESID),
            ], stdout=fout)

        norm_oid2urls = "oid2urls.norm.txt"
        with open(prefiltered) as fin, open(norm_oid2urls, 'w') as fout:
            self.run_process([normalize_bin, '--field-number', '1'], stdin=fin, stdout=fout)

        docid2urls = "docid2urls.txt"
        with open(norm_oid2urls) as fin, open(docid2urls, 'w') as fout:
            self.run_process([url2docid_bin, '-F1'], stdin=fin, stdout=fout)
        self.mark_resource_ready(self.create_resource("docid2urls trie", docid2urls, resource_types.PLAIN_TEXT))

        # both files come from the same input in the same order, which is what
        # save_docid2url_mapping() relies on
        self.save_docid2url_mapping(norm_oid2urls, docid2urls)

        with open(docid2urls) as fin, open(self.DOCID2OID, 'w') as fout:
            self.run_process([wizard_bin], stdin=fin, stdout=fout)

        self.download_addrsnip_data()

        with open(self.DOCID2JSON, 'w') as fout:
            self.run_process([
                build_json_bin,
                '--addrsnippet_data', 'addrsnip.csv',
                '--url_data', self.DOCID2OID,
            ], stdout=fout)

    def get_src_dir(self):
        """Return the result of ``Arcadia.get_arcadia_src_dir`` for the Arcadia trunk URL."""
        trunk_url = Arcadia.trunk_url()
        return Arcadia.get_arcadia_src_dir(trunk_url)

    def do_build(self, targets):
        """Build ``targets`` from the Arcadia source dir in release mode with ymake.

        The build results are placed under ``BINARY_DIR``; ``clear_build`` is
        passed as False.

        :param targets: list of Arcadia-relative target paths.
        """
        source_root = self.get_src_dir()
        build_options = {
            'results_dir': self.BINARY_DIR,
            'clear_build': False,
        }
        sdk.do_build(
            consts.YMAKE_BUILD_SYSTEM,
            source_root,
            targets,
            consts.RELEASE_BUILD_TYPE,
            **build_options
        )

    def initCtx(self):
        self.ctx['kill_timeout'] = 60 * 60 * 24

    def get_data_for_yt(self):
        with open(self.DOCID2JSON) as fin:
            for line in fin:
                docid, json_data = line.strip().split('\t')
                yield {
                    "Subkey_Url": self.DOCID2URL[docid],  # it must exist, so it's ok to get KeyError here
                    "Data_JSON": json_data
                }

    def _init_progress(self):
        """Create the companies resource and split its lines into download portions.

        Stores the id of the created resource under ``ALL_COMP_RESID`` and the
        list of portion records under ``PROGRESS`` in the task context. Each
        record holds a half-open ``(start, end)`` line range, a ``status``
        flag (False) and a ``task_id`` (0 = no subtask yet).
        """
        logging.info("ENTER get_companies")
        self.get_companies_tsv(self.ALL_COMP_FILE)
        comp = self.create_resource(
            "ALL companies", self.ALL_COMP_FILE, resource_types.GEOSEARCH_ORG1_RAW_COMPANIES_INFO
        )
        self.mark_resource_ready(comp)
        self.ctx[self.ALL_COMP_RESID] = comp.id

        oids_to_process = int(self.run_process(
            'wc -l {} | cut -f1 -d" "'.format(self.ALL_COMP_FILE), shell=True, outs_to_pipe=True
        ).communicate()[0])
        oids_limit = get_or_default(self.ctx, self.OidsLimit)
        if oids_limit:
            oids_to_process = min(oids_to_process, oids_limit)

        psize = get_or_default(self.ctx, self.PortionSize)
        pnum = int(ceil(float(oids_to_process) / psize))
        self.ctx[PROGRESS] = [
            {
                # the last portion is cut at the total number of lines to process
                'range': (psize * i, min(psize * (i + 1), oids_to_process)),
                'status': False,
                'task_id': 0,
            }
            for i in six.moves.xrange(pnum)
        ]

    def on_execute(self):
        """Run the whole pipeline.

        Unless a source task is selected, the progress records are created on
        the first entry (when none are stored in the context yet) and the
        downloader subtasks are driven via ``get_urls_from_hamster()``.
        NOTE(review): the stored-progress check suggests this method is
        re-entered after each wait on subtasks - confirm against the SDK.

        Afterwards the subtasks' outputs are aggregated, the tools are built,
        the JSON data is prepared and, if ``PrepareSaasQuerydata`` is set in
        the context, uploaded to YT; with ``UpdateSaasQuerydataProd`` also set,
        ``PrepareSaasQuerysearchData.on_release`` is invoked.
        """
        # get_or_default rather than self.ctx[...]: the source task is an
        # optional parameter and may be absent from the context; this is also
        # how aggregate_urls() reads it.
        if not get_or_default(self.ctx, self.SourceTask):
            if not self.ctx.get(PROGRESS, []):
                self._init_progress()

            self.get_urls_from_hamster()

        logging.info("ENTER aggregate")
        self.aggregate_urls(self.OID2URLS_FILE)
        self.do_build([
            self.TOOLS_PATH,
            self.BUILD_JSON_PATH,
            self.NORMALIZE_URL_PATH,
            self.URL2DOCID_PATH,
            self.QDATA_VIEWER_PATH,
        ])
        self.prepare_json()
        if self.ctx.get(PrepareSaasQuerydata.name, False):
            PrepareSaasQuerysearchData.upload_to_yt(
                self,
                self.get_data_for_yt(), 'url_related_oids', self.get_vault_data('grand', 'yt_token')
            )
            if self.ctx.get(UpdateSaasQuerydataProd.name, False):
                PrepareSaasQuerysearchData.on_release(self, False)


# Module-level alias of the task class; presumably the name under which the
# Sandbox task loader looks up the task - TODO confirm.
__Task__ = Org1AggrDataBuilder
