# -*- coding: utf-8 -*-

from sandbox import common
import sandbox.common.types.misc as ctm
import datetime
import gzip
import logging
import os
import re
from sandbox import sdk2
import xml.sax.saxutils
import urllib

from sandbox.sandboxsdk.environments import PipEnvironment

from sandbox.projects.answers import resources
from sandbox.projects.answers.common.encrypt_mixin import EncryptMixin, GpgSettings
from sandbox.projects.common.nanny.nanny import ReleaseToNannyTask2
from sandbox.projects.answers.common.psql import PostgreSQLStuff, PsqlContextWrapper, Config
from sandbox.projects.answers.common.sitemap import Series


from sandbox.sdk2 import parameters
from sandbox.sdk2.helpers import subprocess


# Transliteration table from Russian Cyrillic letters (lower- and upper-case)
# to Latin replacements. Hard and soft signs map to the empty string, i.e.
# they are dropped.
# NOTE(review): the literals carry no ``u`` prefix, so under Python 2 (which
# this file targets) keys and values are UTF-8 byte strings, not unicode --
# confirm that consumers account for that.
ru_sub = {
    "а": "a",
    "б": "b",
    "в": "v",
    "г": "g",
    "д": "d",
    "е": "e",
    "ё": "yo",
    "ж": "zh",
    "з": "z",
    "и": "i",
    "й": "i",
    "к": "k",
    "л": "l",
    "м": "m",
    "н": "n",
    "о": "o",
    "п": "p",
    "р": "r",
    "с": "s",
    "т": "t",
    "у": "u",
    "ф": "f",
    "х": "kh",
    "ц": "c",
    "ч": "ch",
    "ш": "sh",
    "щ": "sh",
    "ъ": "",
    "ы": "y",
    "ь": "",
    "э": "e",
    "ю": "yu",
    "я": "ya",
    "А": "A",
    "Б": "B",
    "В": "V",
    "Г": "G",
    "Д": "D",
    "Е": "E",
    "Ё": "Yo",
    "Ж": "Zh",
    "З": "Z",
    "И": "I",
    "Й": "I",
    "К": "K",
    "Л": "L",
    "М": "M",
    "Н": "N",
    "О": "O",
    "П": "P",
    "Р": "R",
    "С": "S",
    "Т": "T",
    "У": "U",
    "Ф": "F",
    "Х": "Kh",
    "Ц": "C",
    "Ч": "Ch",
    "Ш": "Sh",
    "Щ": "Sh",
    "Ъ": "",
    "Ы": "Y",
    "Ь": "",
    "Э": "E",
    "Ю": "Yu",
    "Я": "Ya",
}


# Matches any single character that is not a lowercase ASCII letter, a digit,
# a dash or an underscore.
re_non_authorized_chars = re.compile("[^a-z0-9-_]")
# Matches a run of one or more consecutive dashes.
re_multiple_dashes = re.compile("-+")


# Substitute returns the text with every key of the provided substitution map
# replaced by its value. Keys are applied in alphabetical order, one full pass
# per key, so the output of one substitution may be rewritten by a later one.
def substitute(text, map):
    """Apply every replacement from *map* to *text* and return the result.

    Keys are processed in sorted order so the outcome is deterministic and
    matches the documented contract; plain dict iteration order is arbitrary
    on Python 2 and insertion-dependent on newer interpreters.

    :param text: string to transform.
    :param map: mapping ``{substring: replacement}``. The name shadows the
        builtin but is kept because it is part of the call signature.
    :return: the transformed string.
    """
    for key in sorted(map):
        text = text.replace(key, map[key])

    return text


def slugify(text):
    """Turn *text* into a lowercase slug made of ``a-z``, ``0-9``, ``-`` and ``_``.

    Steps: decode byte input as UTF-8, trim surrounding whitespace,
    transliterate Cyrillic through ``ru_sub``, pass the rest through
    ``unidecode``, lowercase, replace every remaining disallowed character
    with a dash, collapse dash runs and trim leading/trailing ``-``/``_``.

    :param text: byte string (assumed UTF-8) or unicode string.
    :return: the slug; may be empty when nothing survives the filtering.
    """
    # Imported lazily: ``unidecode`` is a third-party package.
    from unidecode import unidecode

    if isinstance(text, bytes):
        text = text.decode('utf-8')

    slug = text.strip()

    # The ru_sub literals have no ``u`` prefix, so on Python 2 its keys are
    # UTF-8 byte strings. Passing those to ``unicode.replace`` forces an
    # implicit ASCII decode that raises UnicodeDecodeError, hence the keys
    # are decoded explicitly before substitution.
    cyrillic_map = dict(
        (key.decode('utf-8') if isinstance(key, bytes) else key, replacement)
        for key, replacement in ru_sub.items()
    )
    slug = substitute(slug, cyrillic_map)

    # Process all remaining non-ASCII symbols
    slug = unidecode(slug)

    slug = slug.lower()

    # Replace everything that is still not allowed in a slug
    slug = re_non_authorized_chars.sub("-", slug)

    slug = re_multiple_dashes.sub("-", slug)
    slug = slug.strip("-_")

    return slug


# SQL selecting the ids of questions that have status = 0, at least one
# answer and the is_thequestion flag set.
# NOTE(review): the meaning of status = 0 is not visible here -- confirm
# against the schema.
QUESTIONS_SELECTOR = '''
SELECT id FROM answers__questions
WHERE answers__questions.status = 0 AND answers__questions.answers_count > 0 AND answers__questions.is_thequestion IS TRUE
'''


def url_encode(s):
    """Percent-encode the UTF-8 representation of the text string *s*.

    ``quote`` lives in ``urllib`` on Python 2 and in ``urllib.parse`` on
    Python 3; the import falls back so the helper works on both.

    :param s: text (unicode) string.
    :return: percent-encoded string; ``/`` is left as is (``quote`` default).
    """
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    return quote(s.encode('utf-8'))


def bytes_in(s):
    """Return how many bytes the text string *s* occupies once UTF-8 encoded."""
    encoded = s.encode('utf-8')
    return len(encoded)


class AnswersTheQuestionSiteMapTask(sdk2.Task, ReleaseToNannyTask2, EncryptMixin):
    """Task that builds a sitemap archive for thequestion.ru.

    It restores an Answers PostgreSQL dump into a local database, selects
    question ids, writes gzipped sitemap files plus an index and publishes
    them as a ``sitemap.tar.gz`` resource.
    """

    # Base URL prepended to every generated location.
    HOST = "https://thequestion.ru"
    # NOTE(review): not referenced anywhere in the visible part of this file.
    TLDS = ["by", "ru", "ua", "kz", "com.tr", "com"]
    # strftime pattern used for timestamps produced by this task.
    LASTMOD_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    class Requirements(sdk2.Task.Requirements):
        """Pip packages requested for the task environment."""

        environments = (
            PipEnvironment('yandex-yt'),
            PipEnvironment("yandex-yt-yson-bindings-skynet"),
            PipEnvironment('retry'),
            PipEnvironment('unidecode'),
        )

    class Parameters(sdk2.Parameters):
        """User-facing task parameters."""

        # When True the newest dump resource for ``env`` is looked up;
        # otherwise the explicitly chosen ``psql_dump`` is used.
        use_last_dump = parameters.Bool(
            'Use last database dump',
            required=True,
            default=True,
        )
        # Declared under the ``use_last_dump == False`` branch; it is read
        # only when ``use_last_dump`` is off.
        with use_last_dump.value[False]:
            psql_dump = parameters.Resource(
                'Dump of Answers PSQL',
                resource_type=resources.AnswersPostgresqlDump,
                required=True,
            )
        # Resource whose ``bin/`` directory provides psql and pg_restore.
        postgres_resource = parameters.Resource(
            'Resource with PSQL',
            resource_type=resources.AnswersPostgresql,
            required=True,
        )
        # A falsy value means no RAM drive is requested on enqueue.
        ramdrive_size = parameters.Integer('RamDrive size in GB', default=4)
        # Passed as the GPG recipient when decrypting the dump.
        gpg_key_owner = parameters.String(
            'Gpg Key Owner',
            required=True,
        )
        # Used as the ``env`` attribute filter when searching for the last dump.
        env = parameters.String(
            'Database environment',
            choices=[
                ('dev', resources.Environments.DEV),
                ('prod', resources.Environments.PROD),
            ],
            required=True,
        )

# Event handlers
    def on_enqueue(self):
        """Request a tmpfs RAM drive when ``ramdrive_size`` is non-zero.

        The parameter is labelled in GB; shifting left by 10 multiplies it
        by 1024 (presumably because the requirement is expressed in MB --
        confirm against ``ctm.RamDrive``).
        """
        sdk2.Task.on_enqueue(self)
        if not self.Parameters.ramdrive_size:
            return
        size = int(self.Parameters.ramdrive_size) << 10
        self.Requirements.ramdrive = ctm.RamDrive(ctm.RamDriveType.TMPFS, size, None)

    def on_release(self, parameters):
        """Run the release handlers of both ``sdk2.Task`` and ``ReleaseToNannyTask2``.

        :param parameters: release parameters forwarded unchanged to both
            handlers. The name shadows the module-level ``parameters``
            import inside this method only.
        """
        sdk2.Task.on_release(self, parameters)
        ReleaseToNannyTask2.on_release(self, parameters)
    def on_execute(self):
        """Build the sitemap archive and publish it as a resource.

        Flow: remember the task directory, switch to the RAM drive if one
        is present, decrypt the dump, restore it into a local PostgreSQL,
        fetch question ids, write sitemap files, tar them into the task
        directory and register the tarball as an
        ``AnswersTheQuestionSiteMap`` resource.
        """
        # Captured before setup_ramdrive() may chdir away; create_tar()
        # writes the archive here.
        self.task_dir = os.getcwd()
        settings = GpgSettings(
            key_owner='YASAP',
            secret_key_name='answers_pgp_private_key',
            public_key_name='answers_pgp_public_key',
            recipient=self.Parameters.gpg_key_owner,
        )
        # From here on relative paths ("psql_data", "sitemap") resolve
        # against the RAM drive when one is configured.
        self.setup_ramdrive()
        local_psql_path = str(
            sdk2.ResourceData(
                sdk2.Resource[self.Parameters.postgres_resource]
            ).path
        )
        encrypted_dump_path = self.get_encrypted_dump_path()
        # decrypt() is not defined in this class -- presumably provided by
        # EncryptMixin; confirm.
        local_dump_path = self.decrypt(encrypted_dump_path, settings)
        psql_workdir = 'psql_data'
        self.psql, psql_config = self.config_psql(psql_workdir, local_psql_path)

        # NOTE(review): PsqlContextWrapper presumably keeps the local
        # database running for the duration of the block -- confirm.
        with PsqlContextWrapper(self.psql):
            self.restore_pg(local_dump_path, psql_config, local_psql_path)
            self.sitemap_dir = "sitemap"
            # Fallback timestamp returned by get_lastmod() when no explicit
            # value is given.
            self.lastmod = datetime.datetime.utcnow().strftime(self.LASTMOD_FORMAT)
            os.mkdir(self.sitemap_dir)

            self.questions = self.fetch_table(psql_config, local_psql_path, QUESTIONS_SELECTOR)

            self.populate_index()

        tar_path = self.create_tar()

        self.log_memory_usage()

        resource_metadata = resources.AnswersTheQuestionSiteMap(
            self,
            "Answers the question sitemap by {}".format(
                datetime.datetime.utcnow().strftime(self.LASTMOD_FORMAT)
            ),
            tar_path,
        )
        resource_data = sdk2.ResourceData(resource_metadata)
        resource_data.ready()

    def on_break(self, prev_status, status):
        if hasattr(self, 'psql'):
            self.psql._kill()

    def setup_ramdrive(self):
        if self.ramdrive:
            logging.info(
                'Setup RamDrive size: %s path: %s',
                common.utils.size2str(self.ramdrive.size << 20),
                self.ramdrive.path,
            )
            os.chdir(str(self.ramdrive.path))

    def config_psql(self, workdir, psql_path):
        """Create the PostgreSQL helper together with its configuration.

        :param workdir: passed to ``Config`` as ``work_dir``.
        :param psql_path: passed to ``PostgreSQLStuff`` along with the config.
        :return: ``(psql, config)`` tuple.
        """
        psql_config = Config(work_dir=workdir)
        return PostgreSQLStuff(psql_config, psql_path), psql_config

    def fetch_table(self, psql_config, psql_path, selector):
        """Run *selector* through ``psql`` and return its output rows.

        :param psql_config: object providing ``dbname``, ``port``,
            ``username`` and ``password``.
        :param psql_path: directory containing ``bin/psql``.
        :param selector: SQL text passed to ``psql -c``.
        :return: list of non-empty output lines with surrounding
            whitespace removed.
        :raises Exception: when ``psql`` exits with a non-zero code.
        """
        command = [
            os.path.join(psql_path, 'bin/psql'),
            '--dbname={}'.format(psql_config.dbname),
            '--host={}'.format('localhost'),
            '--port={}'.format(psql_config.port),
            '--username={}'.format(psql_config.username),
            '-t',
            '-c',
            selector,
        ]
        with sdk2.helpers.ProcessLog(
                self, logging.getLogger('psql_fetch')
        ) as pl:
            fetch = subprocess.Popen(
                command,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=pl.stderr,
            )
            # The password is supplied on stdin.
            output = fetch.communicate(psql_config.password)[0]
            # Validate the exit status before trusting the output.
            exitcode = fetch.wait()
            if exitcode:
                raise Exception(
                    'Failed to fetch database table, exitcode: {}'.format(
                        exitcode
                    )
                )

        # Strip first and filter afterwards, so whitespace-only lines are
        # dropped instead of turning into empty values.
        stripped_rows = (row.strip() for row in output.split('\n'))
        return [row for row in stripped_rows if row]

    def restore_pg(self, dump_path, psql_config, psql_path):
        """Load a directory-format dump into the local database with ``pg_restore``.

        :param dump_path: path of the dump passed to ``pg_restore``.
        :param psql_config: object providing ``dbname``, ``port``,
            ``username`` and ``password``.
        :param psql_path: directory containing ``bin/pg_restore``.
        :raises Exception: when ``pg_restore`` exits with a non-zero code.
        """
        command = [
            os.path.join(psql_path, 'bin/pg_restore'),
            '--dbname={}'.format(psql_config.dbname),
            '--host={}'.format('localhost'),
            '--port={}'.format(psql_config.port),
            '--username={}'.format(psql_config.username),
            '--password',
            '--format=d',
            '--schema=public',
            '--single-transaction',
            '--no-owner',
            '--no-privileges',
            dump_path,
        ]
        restore_logger = logging.getLogger('psql_restore')
        with sdk2.helpers.ProcessLog(self, restore_logger) as process_log:
            pg_restore = subprocess.Popen(
                command,
                stdin=subprocess.PIPE,
                stdout=process_log.stdout,
                stderr=process_log.stderr,
            )
            # The password is supplied on stdin.
            pg_restore.communicate(psql_config.password)
            status = pg_restore.wait()
            if status:
                message = 'Failed to restore database from dump exitcode: {}'.format(status)
                raise Exception(message)

    def get_encrypted_dump_path(self):
        """Return the local path of the encrypted database dump.

        With ``use_last_dump`` off the explicitly selected ``psql_dump``
        resource is used; otherwise the ``AnswersPostgresqlDump`` resource
        with the highest id whose ``env`` attribute matches the ``env``
        parameter is looked up.

        :return: path of the dump resource data as a string.
        :raises Exception: when the lookup for the last dump yields nothing.
        """
        if not self.Parameters.use_last_dump:
            return str(
                sdk2.ResourceData(
                    sdk2.Resource[self.Parameters.psql_dump]
                ).path
            )

        dump_resource = sdk2.Resource.find(
            resources.AnswersPostgresqlDump,
            attrs={'env': self.Parameters.env}
        ).order(
            -resources.AnswersPostgresqlDump.id,
        ).first()
        logging.info('found resource = %s', dump_resource)
        # Fail with a clear message rather than passing an empty lookup
        # result on to ResourceData.
        if dump_resource is None:
            raise Exception(
                'No database dump resource found for env: {}'.format(
                    self.Parameters.env
                )
            )
        return str(sdk2.ResourceData(dump_resource).path)

    def log_memory_usage(self):
        """Run ``df -h`` and send its output to the ``df_h`` process log.

        Purely diagnostic: the exit status of ``df`` is not inspected.
        """
        df_logger = logging.getLogger('df_h')
        with sdk2.helpers.ProcessLog(self, df_logger) as process_log:
            subprocess.Popen(
                "df -h".split(),
                stdout=process_log.stdout,
                stderr=process_log.stderr,
            ).wait()

    def create_tar(self):
        """Pack the sitemap directory into ``sitemap.tar.gz`` in the task directory.

        :return: path of the created archive.
        :raises Exception: when ``tar`` exits with a non-zero code, so a
            missing or truncated archive is not returned to the caller.
        """
        tar_name = os.path.join(self.task_dir, "sitemap.tar.gz")
        with sdk2.helpers.ProcessLog(
                self, logging.getLogger('tar common sitemap')
        ) as pl:
            process = subprocess.Popen(
                [
                    "tar",
                    "-czvf",
                    tar_name,
                    self.sitemap_dir,
                ],
                stdout=pl.stdout,
                stderr=pl.stderr,
            )
            exitcode = process.wait()
            # Same policy as the psql/pg_restore helpers: a failed
            # subprocess aborts the task.
            if exitcode:
                raise Exception(
                    'Failed to create sitemap archive, exitcode: {}'.format(
                        exitcode
                    )
                )
        return tar_name

    def populate_index(self):
        """Generate all sitemap files and then the index that lists them."""
        # Reset the list of produced sitemap file names; add_urls() appends
        # to it and create_index() reads it, so the order below matters.
        self.sitemap_names = []
        self.add_question_urls()
        self.create_index()

    def create_index(self):
        index_content = u"""<?xml version="1.0" encoding="UTF-8"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">{values}</sitemapindex>""".format(
            values=''.join(
                [
                    u"<sitemap><loc>{name}</loc></sitemap>".format(
                        name="{host}/{sitemap_dir}/{name}".format(
                            host=self.HOST,
                            sitemap_dir=self.sitemap_dir,
                            name=name,
                        )
                    ) for name in self.sitemap_names
                ]
            )
        )
        name = 'sitemap-index.xml'
        filename = os.path.join(self.sitemap_dir, name)
        with open(filename, 'w') as f:
            f.write(index_content.encode('utf-8'))

    def get_lastmod(self, lastmod):
        if lastmod:
            return lastmod.strftime(self.LASTMOD_FORMAT)
        else:
            return self.lastmod

    def add_urls(self, urls, entity):
        """Render *urls* as ``<url>`` elements and write them as gzipped sitemaps.

        Every produced file name is appended to ``self.sitemap_names`` and
        the file itself is written to ``self.sitemap_dir``.

        :param urls: iterable of dicts; each must contain ``'loc'``. Every
            key/value pair becomes a ``<key>value</key>`` child element.
            Only ``loc`` is XML-escaped.
        :param entity: label embedded in the file names
            (``sitemap-<entity>-<num>.xml.gz``).
        :raises KeyError: when a dict has no ``'loc'`` key.
        """
        rendered = []
        for url in urls:
            # Escape on a copy: the caller's dicts stay untouched, so
            # passing the same data again cannot double-escape it.
            escaped = dict(url, loc=xml.sax.saxutils.escape(url['loc']))
            rendered.append(
                u'<url>{elements}</url>'.format(
                    elements=''.join(
                        u'<{key}>{value}</{key}>'.format(
                            key=key,
                            value=value,
                        ) for key, value in escaped.items()
                    )
                )
            )
        # Series splits the rendered entries into sitemap chunks.
        series = Series(rendered)
        for i, sitemap in enumerate(series.sitemaps, 1):
            filename = 'sitemap-{entity}-{num}.xml.gz'.format(entity=entity, num=i)
            self.sitemap_names.append(filename)
            filename = os.path.join(self.sitemap_dir, filename)
            with gzip.open(filename, 'wb') as f:
                f.write(sitemap.data.encode('utf-8'))

    def add_question_urls(self):
        """Build one ``loc`` entry per fetched question and write the 'questions' sitemaps."""
        question_urls = []
        for question_id in self.questions:
            location = self.get_question_url(host=self.HOST, question_identity=question_id)
            question_urls.append({'loc': location})
        self.add_urls(question_urls, 'questions')

    def get_question_url(self, host, question_identity):
        return u"{host}/questions/{question}/".format(host=host, question=question_identity)


# Module-level alias for the task class.
# NOTE(review): presumably looked up by the task loader -- confirm.
__TASK__ = AnswersTheQuestionSiteMapTask
