# -*- coding: utf-8 -*-

import os
import tarfile

from sandbox import sdk2, common
import sandbox.common.types.task as ctt
import sandbox.common.types.resource as ctr

from sandbox.projects import resource_types

# Domain keys; these are the stored values of the ``domain`` task parameter
# choices ('yandex.ru' -> RU_DOMAIN, 'yandex.com.tr' -> TR_DOMAIN).
RU_DOMAIN = "ru"
TR_DOMAIN = "tr"


class BaseBuildPumpkinSerp(sdk2.Task):
    """Base task that collects SERPs for the top queries of a pumpkin index.

    Workflow implemented by ``on_execute``:

    1. Find a READY index resource carrying the production attributes, split
       its ``topqueries.txt`` into URL volumes, pack them into a tar resource
       and start a ``SerpCollector`` subtask on that resource.
    2. After the collector finishes, optionally (``run_test``) start a test
       task and store its verdict in ``status`` on both the SERP archive and
       the index archive resources.

    Subclasses are expected to implement the hooks that raise
    ``NotImplementedError``: ``_make_production_attributes``,
    ``_make_serp_url``, ``_make_main_page_url``, ``_make_notfound_page_url``
    and ``_make_test_task``.
    """

    class Parameters(sdk2.Task.Parameters):
        # Which domain to build URLs / look up production attributes for.
        domain = sdk2.parameters.String(
            "Domain", default=RU_DOMAIN, required=True,
            choices=[
                ('yandex.ru', RU_DOMAIN),
                ('yandex.com.tr', TR_DOMAIN),
            ]
        )
        # Used in the URL archive file name and passed to the collector.
        series_name = sdk2.parameters.String("Name of the series", default_value="generalized", required=True)
        # Number of files the generated URLs are distributed over (round-robin).
        urls_volumes = sdk2.parameters.Integer("Number of output url resource volumes", default_value=5, required=True)
        # Optional cap on the number of lines read from topqueries.txt.
        top_queries = sdk2.parameters.Integer("Number of queries to collect SERPs for", required=False)
        # When off, production attributes are attached to the collector output.
        development_mode = sdk2.parameters.Bool("Development mode", default=True)
        # When on, a test task is started after the collector succeeds.
        run_test = sdk2.parameters.Bool("Run test task", default_value=False)
        # The remaining parameters are forwarded to the collector subtask as is.
        connections_limit = sdk2.parameters.Integer("Limit of simultaneous downloads", default=100, required=True)
        zip_only = sdk2.parameters.Bool("Store zipped pages only", default_value=True)
        add_cgi_params = sdk2.parameters.String("CGI parameters to be added to every request. Example: nocache=da&sbh=1")
        serp_header_delimeter = sdk2.parameters.String("Delimeter between the header and body of serp")

    # Resource types of the index and of its archive; kept as class attributes
    # so they can be overridden in subclasses.
    INDEX_RESOURCE_TYPE = resource_types.PUMPKIN_INDEX
    ARCHIVE_RESOURCE_TYPE = resource_types.PUMPKIN_INDEX_ARCHIVE

    def on_execute(self):
        """Run the collector subtask and, if requested, the test task.

        Raises:
            common.errors.TaskFailure: when the index resource, its archive
                or the collected SERP archive cannot be found, or when the
                collector subtask did not finish with SUCCESS.
            sdk2.WaitTask: to suspend this task until a subtask finishes.
        """
        with self.memoize_stage.run_serp_collector:
            domain = self.Parameters.domain
            production_attrs = self._production_attributes(domain)
            index_resource = self._find_last_resource(self.INDEX_RESOURCE_TYPE, production_attrs)
            if not index_resource:
                raise common.errors.TaskFailure("Unable to find last production index resource")
            self.Context.index_resource_id = index_resource.id

            # Resolve the archive before building anything, so a missing
            # archive fails the task early with a readable message instead of
            # an AttributeError on ``None`` after the URL resource is created.
            index_archive_resource = sdk2.Resource.find(
                self.ARCHIVE_RESOURCE_TYPE, task=index_resource.task, state=ctr.State.READY).first()
            if not index_archive_resource:
                raise common.errors.TaskFailure(
                    "Unable to find index archive resource for index resource {}".format(index_resource.id))
            self.Context.index_archive_resource_id = index_archive_resource.id

            index_path = str(sdk2.ResourceData(index_resource).path)
            urls_archive_resource = self._make_archive(os.path.join(index_path, "topqueries.txt"))
            self.Context.urls_resource_id = urls_archive_resource.id

            collector_task = self._serp_collector(urls_archive_resource, index_resource, index_archive_resource)
            collector_task.enqueue()
            self.Context.collector_task_id = collector_task.id

            raise sdk2.WaitTask([collector_task.id], ctt.Status.Group.FINISH)

        collector_task = sdk2.Task[self.Context.collector_task_id]
        if collector_task.status != ctt.Status.SUCCESS:
            raise common.errors.TaskFailure("Subtask failed")

        if not self.Parameters.run_test:
            return

        serps_archive_resource = sdk2.Resource.find(
            resource_types.SERP_COLLECTOR_LOOT,
            task_id=collector_task.id).first()
        if not serps_archive_resource:
            raise common.errors.TaskFailure("No serps archive was created")

        with self.memoize_stage.run_test_task:
            test_task = self._make_test_task(
                self.Context.urls_resource_id, serps_archive_resource.id)
            test_task.enqueue()
            self.Context.test_task_id = test_task.id
            raise sdk2.WaitTask([test_task.id], ctt.Status.Group.FINISH)

        index_archive_resource = sdk2.Resource.find(
            self.ARCHIVE_RESOURCE_TYPE,
            id=self.Context.index_archive_resource_id
        ).first()
        if not index_archive_resource:
            raise common.errors.TaskFailure(
                "Unable to find index archive resource {}".format(self.Context.index_archive_resource_id))

        # Only FAILURE and SUCCESS produce a verdict; any other final status
        # of the test task leaves both resources untouched.
        test_task = sdk2.Task[self.Context.test_task_id]
        if test_task.status == ctt.Status.FAILURE:
            serps_archive_resource.status = "TEST_FAILED"
            index_archive_resource.status = "TEST_FAILED"
        elif test_task.status == ctt.Status.SUCCESS:
            serps_archive_resource.status = "TEST_OK"
            index_archive_resource.status = "TEST_OK"

    def _find_last_resource(self, resource, attrs):
        """Return the first READY resource of type ``resource`` matching ``attrs``.

        Returns ``None`` (falsy) when nothing matches. Presumably the lookup
        yields the most recent resource first -- verify against the SDK's
        default ordering.
        """
        return sdk2.Resource.find(
            resource,
            attrs=attrs,
            state=ctr.State.READY
        ).first()

    def _make_archive(self, queries_path):
        """Build the tar resource with SERP URLs for the top queries.

        Reads ``queries_path`` line by line, takes the second tab-separated
        column of each line as the query, and writes two URLs per query (with
        and without header) into ``urls_volumes`` files in round-robin order.
        The files are then packed into a ``SERP_COLLECTOR_URLS`` resource.

        Args:
            queries_path: path to the tab-separated top queries file.

        Returns:
            The created (and marked ready) URL archive resource.

        Raises:
            common.errors.TaskFailure: if ``urls_volumes`` is not positive.
            IndexError: if a line has no second tab-separated column.
        """
        volume_count = self.Parameters.urls_volumes
        top_queries = self.Parameters.top_queries
        domain = self.Parameters.domain

        # A non-positive volume count would otherwise surface as an opaque
        # ZeroDivisionError (or silently produce no output files).
        if not volume_count or volume_count <= 0:
            raise common.errors.TaskFailure(
                "Number of url volumes must be positive, got {}".format(volume_count))

        archive_dir = 'urls_archive'
        # Tolerate a directory left over from a previous run of this stage.
        if not os.path.isdir(archive_dir):
            os.makedirs(archive_dir)

        out_files = []
        try:
            for n in range(volume_count):
                out_files.append(open(os.path.join(archive_dir, str(n)), "w"))

            with open(queries_path) as input_file:
                for number, line in enumerate(input_file):
                    # A falsy top_queries (unset or 0) means "no limit".
                    if top_queries and number >= top_queries:
                        break
                    query = line.strip().split('\t')[1]

                    volume = out_files[number % volume_count]
                    volume.write(self._make_serp_url(domain, query) + "\n")
                    volume.write(self._make_serp_url_without_header(domain, query) + "\n")
        finally:
            # Close every volume that was opened, even if processing failed.
            for out_file in out_files:
                out_file.close()

        urls_archive_name = "archive_urls_{0}.tar".format(self.Parameters.series_name)
        urls_archive_resource = resource_types.SERP_COLLECTOR_URLS(self, "Archive for {}".format(self.id), urls_archive_name)
        rd = sdk2.ResourceData(urls_archive_resource)
        with tarfile.open(str(rd.path), "w") as tar:
            # arcname "." puts the volume files at the root of the archive.
            tar.add(archive_dir, ".")

        rd.ready()

        return urls_archive_resource

    def _production_attributes(self, domain):
        """Return the production attribute of ``domain`` as a one-item dict."""
        return dict([self._make_production_attributes(domain)])

    def _make_production_attributes(self, domain):
        """Hook: return a ``(name, value)`` pair marking production resources."""
        raise NotImplementedError()

    def _make_serp_url(self, domain, text):
        """Hook: return the SERP URL for query ``text`` on ``domain``."""
        raise NotImplementedError()

    def _make_serp_url_without_header(self, domain, text):
        """Return the header-less SERP URL; defaults to ``_make_serp_url``."""
        return self._make_serp_url(domain, text)

    def _make_main_page_url(self, domain):
        """Hook: return the main page URL for ``domain``."""
        raise NotImplementedError()

    def _make_notfound_page_url(self, domain):
        """Hook: return the "not found" page URL for ``domain``."""
        raise NotImplementedError()

    def _make_notfound_page_url_without_header(self, domain):
        """Return the header-less "not found" URL; defaults to the regular one."""
        # Must return the URL: raising the returned value would fail with a
        # TypeError as soon as a subclass implements the hook above.
        return self._make_notfound_page_url(domain)

    def _make_test_task(self, urls_archive, serps_archive):
        """Hook: create (without enqueueing) the test task.

        ``on_execute`` passes the ids of the URL archive and SERP archive
        resources and then calls ``enqueue()`` on the returned task.
        """
        raise NotImplementedError()

    def _make_url(self, domain):
        """Return ``None``; not used by this base class."""
        return None

    def _serp_collector(self, urls_archive_resource, index_resource, index_archive_resource):
        """Create (without enqueueing) the ``SerpCollector`` subtask.

        The ids of the index and of its archive are always passed on in
        ``ResourceAttrs``; outside development mode the production attribute
        and ``production_mode=1`` are added as well.

        Returns:
            The created collector task.
        """
        # Imported lazily, as in the original code, so the module can be
        # loaded without the collector project.
        from projects.SerpCollector import SerpCollector

        domain = self.Parameters.domain

        attributes = {
            'index_resource_id': index_resource.id,
            'index_archive_id': index_archive_resource.id
        }
        if not self.Parameters.development_mode:
            attr_name, attr_value = self._make_production_attributes(domain)
            attributes[attr_name] = attr_value
            attributes['production_mode'] = '1'

        collector_class = sdk2.Task[SerpCollector.type]
        task = collector_class(
            self,
            description="Collector for {}".format(self.id),
            UrlsResource=urls_archive_resource.id,
            MainPageUrl=self._make_main_page_url(domain),
            NotFoundPageUrl=self._make_notfound_page_url(domain),
            NotFoundPageUrlWithoutHeader=self._make_notfound_page_url_without_header(domain),
            SeriesName=self.Parameters.series_name,
            # items() works on both Python 2 and 3, unlike iteritems().
            ResourceAttrs=','.join("{0}={1}".format(k, v) for k, v in attributes.items()),
            DisregardTime=self.Parameters.development_mode,
            ZipOnly=self.Parameters.zip_only,
            AddCGIParams=self.Parameters.add_cgi_params,
            ConnsLimit=self.Parameters.connections_limit,
            SerpHeaderDelimeter=self.Parameters.serp_header_delimeter,
        )

        return task
