# -*- coding: utf-8 -*-

from sandbox import common
import sandbox.common.types.misc as ctm
import datetime
import gzip
import logging
import os
from sandbox import sdk2
import xml.sax.saxutils
import urllib

from sandbox.sandboxsdk.environments import PipEnvironment

from sandbox.projects.answers import resources
from sandbox.projects.answers.common.sitemap import Series

from sandbox.projects.common.nanny.nanny import ReleaseToNannyTask2

from sandbox.sdk2 import parameters
from sandbox.sdk2.helpers import subprocess


# Query templates executed by QSiteMapTask.fetch_table. The `{dump_dir}`
# placeholder is filled in with the dump directory path via str.format, and
# fetch_table keeps only the first value of every returned row, so each query
# selects exactly one column.

# Tags whose `questions_count` is greater than 2.
TAGS_SELECTOR = '''
SELECT tag FROM `{dump_dir}/answers__tags`
WHERE questions_count > 2
'''


# Slugs of all categories (no filtering).
CATEGORIES_SELECTOR = '''
SELECT slug FROM `{dump_dir}/answers__categories`
'''


# Slugs of questions with status = 0 that have at least one answer.
# NOTE(review): status = 0 presumably means "published/visible" - confirm
# against the database schema.
QUESTIONS_SELECTOR = '''
SELECT slug FROM `{dump_dir}/answers__questions`
WHERE status = 0 AND answers_count > 0
'''


# Community identifiers: `short_url` when it is set, otherwise `id`, for
# communities with status = 0 and more than one member.
COMMUNITIES_SELECTOR = '''
SELECT
coalesce(short_url, id)
FROM `{dump_dir}/answers__communities`
WHERE status = 0
AND members_count > 1
'''


# Public ids of users that have one.
USERS_SELECTOR = '''
SELECT public_id FROM `{dump_dir}/answers__users`
WHERE public_id IS NOT NULL
'''


# Slugs of articles with status = 0.
ARTICLES_SELECTOR = '''
SELECT
slug
FROM `{dump_dir}/answers__articles`
WHERE status = 0
'''


def url_encode(s):
    """Percent-encode *s* for use inside a URL path.

    Text is encoded to UTF-8 before quoting; byte strings are assumed to be
    UTF-8 already and are quoted as they are. As with the default behaviour
    of ``quote``, the ``/`` character is left unescaped.

    :param s: text (unicode) or UTF-8 encoded byte string.
    :return: percent-encoded ASCII string.
    """
    # ``quote`` lives in ``urllib`` on Python 2 and in ``urllib.parse`` on
    # Python 3; resolve it locally so the helper works on both.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    # Encoding an already-encoded Python 2 byte string would trigger an
    # implicit ASCII decode and fail on non-ASCII data, so only encode text.
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    return quote(s)


def bytes_in(s):
    """Return the number of bytes *s* occupies once encoded as UTF-8."""
    encoded = s.encode('utf-8')
    return len(encoded)


class QSiteMapTask(sdk2.Task, ReleaseToNannyTask2):
    """Build the Q sitemap archive and publish it as a Sandbox resource.

    The task reads entity identifiers (questions, categories, tags,
    communities, users and articles) from the database dump tables through
    CHYT, renders gzipped sitemap files and a sitemap index into the
    ``sitemap`` directory, packs that directory into ``sitemap.tar.gz`` and
    registers the archive as a ``resources.QSiteMap`` resource.
    """

    # Base URL every sitemap location is built from.
    HOST = "https://yandex.ru/q"
    # Not referenced anywhere inside this class.
    TLDS = ["by", "ru", "ua", "kz", "com.tr", "com"]
    # Timestamp format used in the description of the published resource.
    LASTMOD_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    class Requirements(sdk2.Task.Requirements):
        environments = (
            PipEnvironment('yandex-yt'),
            PipEnvironment("yandex-yt-yson-bindings-skynet"),
            PipEnvironment('retry'),
            PipEnvironment('unidecode'),
        )

    class Parameters(sdk2.Parameters):
        ramdrive_size = parameters.Integer('RamDrive size in GB', default=4)
        env = parameters.String(
            'Database environment',
            choices=[
                ('dev', resources.Environments.DEV),
                ('prod', resources.Environments.PROD),
            ],
            required=True,
        )
        yt_cluster = parameters.String('YtCluster', default='arnold')
        yt_clique = parameters.String('yt_clique', default='*answers')
        yt_token_name = parameters.String('yt_token', default='answers_robot_yt_token')
        yt_token_owner = parameters.String('yt_token_owner', default='YASAP')

    # Event handlers
    def on_enqueue(self):
        """Request a tmpfs RAM drive when ``ramdrive_size`` is non-zero."""
        sdk2.Task.on_enqueue(self)
        if self.Parameters.ramdrive_size:
            self.Requirements.ramdrive = ctm.RamDrive(
                ctm.RamDriveType.TMPFS,
                # The parameter is given in GB; `<< 10` scales it by 1024.
                # NOTE(review): presumably RamDrive expects MiB (setup_ramdrive
                # shifts the size by 20 to get bytes) - confirm.
                int(self.Parameters.ramdrive_size) << 10,
                None
            )

    def on_release(self, parameters):
        """Run the release handlers of both base classes.

        The argument name shadows the module-level ``parameters`` import
        inside this method only; it is kept for interface compatibility.
        """
        sdk2.Task.on_release(self, parameters)
        ReleaseToNannyTask2.on_release(self, parameters)

    def on_execute(self):
        """Fetch all entities, render the sitemaps and publish the archive."""
        # Imported here rather than at module level.
        # NOTE(review): presumably because the packages are only available
        # once the PipEnvironment requirements are installed - confirm.
        import yt.clickhouse as chyt
        import yt.wrapper as yt

        # Remember the task directory before setup_ramdrive may change the
        # working directory: the final archive is written there.
        self.task_dir = os.getcwd()
        self.setup_ramdrive()

        self.sitemap_dir = "sitemap"
        os.mkdir(self.sitemap_dir)

        self.client = yt.YtClient(
            self.Parameters.yt_cluster,
            token=sdk2.Vault.data(self.Parameters.yt_token_owner, self.Parameters.yt_token_name)
        )

        self.questions = self.fetch_table(QUESTIONS_SELECTOR, chyt)
        self.categories = self.fetch_table(CATEGORIES_SELECTOR, chyt)
        self.tags = self.fetch_table(TAGS_SELECTOR, chyt)
        self.communities = self.fetch_table(COMMUNITIES_SELECTOR, chyt)
        self.users = self.fetch_table(USERS_SELECTOR, chyt)
        self.articles = self.fetch_table(ARTICLES_SELECTOR, chyt)

        self.populate_index()

        tar_path = self.create_tar()

        self.log_memory_usage()

        resource_metadata = resources.QSiteMap(
            self,
            "Q sitemap by {}".format(
                datetime.datetime.utcnow().strftime(self.LASTMOD_FORMAT)
            ),
            tar_path,
        )

        resource_data = sdk2.ResourceData(resource_metadata)
        resource_data.ready()

    def setup_ramdrive(self):
        """Switch the working directory to the RAM drive when one is set."""
        if self.ramdrive:
            logging.info(
                'Setup RamDrive size: %s path: %s',
                common.utils.size2str(self.ramdrive.size << 20),
                self.ramdrive.path,
            )
            os.chdir(str(self.ramdrive.path))

    def fetch_table(self, query_template, chyt):
        """Run a single-column query through CHYT and collect its values.

        :param query_template: SQL text with a ``{dump_dir}`` placeholder.
        :param chyt: the ``yt.clickhouse`` module used to execute the query.
        :return: list with the stripped first value of every row; NULL and
            empty values are skipped.
        :raises Exception: when the ``env`` parameter is neither PROD nor DEV.
        """
        if self.Parameters.env == resources.Environments.PROD:
            dump_dir = '//home/answers/pg_dumps/production/latest'
        elif self.Parameters.env == resources.Environments.DEV:
            dump_dir = '//home/answers/pg_dumps/prestable/latest'
        else:
            raise Exception('env not found')

        query = query_template.format(dump_dir=dump_dir)

        result = []
        rows = chyt.execute(
            query,
            alias=self.Parameters.yt_clique,
            client=self.client
        )
        for row in rows:
            value = list(row.values())[0]
            if value is None:
                # A NULL cell has nothing to contribute to a URL, and calling
                # .strip() on it would abort the whole task.
                continue
            value = value.strip()
            if value:
                result.append(value)

        logging.info(
            'CHYT respond contains %d items',
            len(result)
        )

        return result

    def log_memory_usage(self):
        """Write the output of ``df -h`` to the ``df_h`` process log.

        Despite the name this reports filesystem usage. The exit status is
        deliberately ignored: the call is purely diagnostic.
        """
        with sdk2.helpers.ProcessLog(self, logging.getLogger('df_h')) as pl:
            process = subprocess.Popen(
                "df -h".split(),
                stdout=pl.stdout,
                stderr=pl.stderr,
            )
            process.wait()

    def create_tar(self):
        """Pack the sitemap directory into ``sitemap.tar.gz``.

        The archive is created in the original task directory, not in the
        current working directory.

        :return: path of the created archive.
        :raises subprocess.CalledProcessError: when ``tar`` exits with a
            non-zero status, so a broken archive is never published.
        """
        tar_name = os.path.join(self.task_dir, "sitemap.tar.gz")
        with sdk2.helpers.ProcessLog(
                self, logging.getLogger('tar common sitemap')
        ) as pl:
            subprocess.check_call(
                [
                    "tar",
                    "-czvf",
                    tar_name,
                    self.sitemap_dir,
                ],
                stdout=pl.stdout,
                stderr=pl.stderr,
            )
        return tar_name

    def populate_index(self):
        """Write the sitemap files of every entity type, then the index."""
        self.sitemap_names = []

        self.add_question_urls()
        self.add_category_urls()
        self.add_tag_urls()
        self.add_community_urls()
        self.add_user_urls()
        self.add_article_urls()

        self.create_index()

    def create_index(self):
        """Write ``sitemap-index.xml`` listing every generated sitemap file."""
        index_content = u"""<?xml version="1.0" encoding="UTF-8"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">{values}</sitemapindex>""".format(
            values=''.join(
                [
                    u"<sitemap><loc>{name}</loc></sitemap>".format(
                        name="{host}/{sitemap_dir}/{name}".format(
                            host=self.HOST,
                            sitemap_dir=self.sitemap_dir,
                            name=name,
                        )
                    ) for name in self.sitemap_names
                ]
            )
        )
        name = 'sitemap-index.xml'
        filename = os.path.join(self.sitemap_dir, name)
        # The content is encoded explicitly, so the file is opened in binary
        # mode (the same way the gzipped sitemaps are written).
        with open(filename, 'wb') as f:
            f.write(index_content.encode('utf-8'))

    def add_urls(self, urls, entity):
        """Render *urls* into gzipped sitemap files for *entity*.

        :param urls: list of dicts mapping sitemap element names to values;
            every dict must contain ``'loc'``. Only the ``loc`` value is
            XML-escaped. The dicts are not modified.
        :param entity: entity name used in the generated file names
            (``sitemap-<entity>-<num>.xml.gz``).
        :raises KeyError: when a dict has no ``'loc'`` key.
        """
        rendered = []
        for data in urls:
            escaped_loc = xml.sax.saxutils.escape(data['loc'])
            elements = u''.join(
                u'<{key}>{value}</{key}>'.format(
                    key=key,
                    value=escaped_loc if key == 'loc' else value,
                ) for key, value in data.items()
            )
            rendered.append(u'<url>{elements}</url>'.format(elements=elements))

        # Series splits the rendered entries into individual sitemap documents.
        series = Series(rendered)
        for i, sitemap in enumerate(series.sitemaps, 1):
            filename = 'sitemap-{entity}-{num}.xml.gz'.format(entity=entity, num=i)
            self.sitemap_names.append(filename)
            filename = os.path.join(self.sitemap_dir, filename)
            with gzip.open(filename, 'wb') as f:
                f.write(sitemap.data.encode('utf-8'))

    def add_question_urls(self):
        """Write the sitemaps for the fetched questions."""
        urls = [
            {
                'loc': self.get_question_url(host=self.HOST, question_identity=question),
            } for question in self.questions
        ]
        self.add_urls(urls, 'questions')

    def get_question_url(self, host, question_identity):
        """Return the page URL of the question *question_identity*."""
        return u"{host}/question/{question}/".format(host=host, question=question_identity)

    def add_category_urls(self):
        """Write the sitemaps for the fetched categories."""
        urls = [
            {
                'loc': self.get_category_url(host=self.HOST, category_identity=category),
            } for category in self.categories
        ]
        self.add_urls(urls, 'categories')

    def get_category_url(self, host, category_identity):
        """Return the page URL of the category *category_identity*."""
        return u"{host}/category/{category}/".format(host=host, category=category_identity)

    def add_article_urls(self):
        """Write the sitemaps for the fetched articles."""
        urls = [
            {
                'loc': self.get_article_url(host=self.HOST, slug=slug),
            } for slug in self.articles
        ]
        self.add_urls(urls, 'articles')

    def get_article_url(self, host, slug):
        """Return the page URL of the article *slug*."""
        return u"{host}/article/{slug}/".format(host=host, slug=slug)

    def add_tag_urls(self):
        """Write the sitemaps for the fetched tags."""
        urls = [
            {
                'loc': self.get_tag_url(host=self.HOST, tag_identity=tag),
            } for tag in self.tags
        ]
        self.add_urls(urls, 'tags')

    def get_tag_url(self, host, tag_identity):
        """Return the page URL of the tag; the tag text is percent-encoded."""
        return u"{host}/tag/{tag}/".format(host=host, tag=url_encode(tag_identity))

    def add_community_urls(self):
        """Write the sitemaps for the fetched communities."""
        urls = [
            {
                'loc': self.get_community_url(host=self.HOST, short_url=short_url),
            } for short_url in self.communities
        ]
        self.add_urls(urls, 'communities')

    def get_community_url(self, host, short_url):
        """Return the page URL of the community *short_url*."""
        return u"{host}/loves/{short_url}/".format(host=host, short_url=short_url)

    def add_user_urls(self):
        """Write the sitemaps for the fetched users."""
        urls = [
            {
                'loc': self.get_user_url(host=self.HOST, public_id=public_id),
            } for public_id in self.users
        ]
        self.add_urls(urls, 'users')

    def get_user_url(self, host, public_id):
        """Return the profile URL of the user *public_id*."""
        return u"{host}/profile/{public_id}/".format(host=host, public_id=public_id)


# Module-level alias of the task class.
# NOTE(review): presumably read by the Sandbox task loader to locate the task
# defined in this module - confirm before renaming or removing.
__TASK__ = QSiteMapTask
