# -*- coding: utf-8 -*-

from sandbox.projects.common.nanny import nanny
from sandbox.projects.common.apihelpers import get_last_resource_with_attrs
from sandbox.sandboxsdk import sandboxapi
from sandbox.sandboxsdk.errors import SandboxTaskFailureError
from sandbox.sandboxsdk.parameters import SandboxStringParameter, SandboxBoolParameter, SandboxIntegerParameter
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.task import SandboxTask

import json
import logging
import os
import re
import requests


class Params(object):
    """
        Input parameter declarations for the GetPokazometerDatabase task.

        Each nested class describes one task parameter; ``params`` collects
        them in the order they are exposed via ``input_parameters``.
    """

    # Base URL under which the dated database directories are listed.
    class DatabaseGlobalPrefix(SandboxStringParameter):
        name = 'database_global_prefix'
        description = 'Database global url prefix'
        default_value = 'http://uspokazgen01i.yandex.ru/pokazometer/dbs'

    # Name prefix of the dated directories; also used as the archive name prefix.
    class DatabaseDirPrefix(SandboxStringParameter):
        name = 'database_dir_prefix'
        description = 'Database dir prefix (like "shows_")'
        default_value = 'clicks_'

    # Explicit directory to download from; only used when FindLatest is off.
    class DatabaseDateDir(SandboxStringParameter):
        name = 'database_date_dir'
        description = 'Database date dir (with prefix, like "shows_20161208")'
        default_value = ''

    # Comma-separated file names; the task splits this value on ','.
    class DatabaseFiles(SandboxStringParameter):
        name = 'database_files'
        description = 'Database filenames (comma separated list, without spaces)'
        default_value = 'click_hits.txt,click_hits_click_index.txt'

    # NOTE(review): the description says "size < min_db_size", while the check
    # in GetPokazometerDatabase.get_db_sizes_and_date rejects
    # size <= min_db_size -- confirm which one is intended.
    class MinDbSize(SandboxIntegerParameter):
        name = 'min_db_size'
        description = 'database with size < min_db_size will not be released (simple check for db validity)'
        default_value = 1024

    # File inside the database directory whose content starts with the db date.
    class DateFile(SandboxStringParameter):
        name = 'date_file'
        description = 'Database status file with date (like "dbdate" or "built")'
        default_value = 'built'

    class ResourceType(SandboxStringParameter):
        name = 'resource_type'
        description = 'Database resource type'
        required = True
        default_value = 'POKAZOMETER_DATABASE_CLICKS'
        # Was misspelled as "choises", so the restriction was never applied.
        # Uses (label, value) pairs, the same shape as ReleaseTo.choices.
        choices = [
            (resource_type, resource_type)
            for resource_type in (
                'POKAZOMETER_DATABASE_CLICKS',
                'POKAZOMETER_DATABASE_SHOWS',
                'ADVQUICK_DATABASE',
            )
        ]

    class FindLatest(SandboxBoolParameter):
        name = 'find_latest'
        description = 'Find latest database dir automatically'
        default_value = True

    class OnlyNew(SandboxBoolParameter):
        name = 'only_new'
        description = 'Download db only if there is no sb resource with specified date'
        default_value = True

    # Empty value means "do not release"; otherwise one of the release statuses.
    class ReleaseTo(SandboxStringParameter):
        name = 'release_to'
        choices = [('do not release', '')] + [(x, x) for x in sandboxapi.RELEASE_STATUSES]
        description = 'Automatically release resource to'
        default_value = ''

    params = [
        DatabaseGlobalPrefix,
        DatabaseDirPrefix,
        DatabaseDateDir,
        DatabaseFiles,
        MinDbSize,
        DateFile,
        ResourceType,
        FindLatest,
        OnlyNew,
        ReleaseTo,
    ]


class GetPokazometerDatabase(SandboxTask, nanny.ReleaseToNannyTask):
    """
        Fetches the database for Pokazometer (advq whale).

        The task locates a database directory under the configured URL prefix,
        checks the reported size of every database file, downloads the files
        with curl, packs them into a tar.gz archive and registers the archive
        as a resource carrying a ``date`` attribute. If a release status is
        configured, a subtask of the same type is created to release the parent.
    """
    type = 'GET_POKAZOMETER_DATABASE'

    input_parameters = Params.params

    do_release_flag = 'do_release'

    # Seconds to wait for the database host before giving up; without a
    # timeout ``requests`` may block forever on an unresponsive server.
    _HTTP_TIMEOUT = 60

    def on_execute(self):
        """
            Run the task in one of two modes.

            When the context carries ``do_release_flag`` the task acts as the
            release subtask: it waits for the parent task and creates a release
            for it. Otherwise it downloads, packs and registers the database.
        """
        if self.ctx.get(self.do_release_flag):
            with self.memoize_stage.wait_parent:
                self.wait_tasks(self.parent_id, (self.Status.Group.FINISH + self.Status.Group.BREAK), wait_all=True)
            self.create_release(self.parent_id, status=self.ctx[Params.ReleaseTo.name], subject='Automatic release')
            return

        self.db_global_prefix = self.ctx.get(Params.DatabaseGlobalPrefix.name)
        self.db_dir_prefix = self.ctx.get(Params.DatabaseDirPrefix.name)
        self.db_date_dir = self.ctx.get(Params.DatabaseDateDir.name)
        self.db_files = self.ctx.get(Params.DatabaseFiles.name).split(',')
        self.min_db_size = int(self.ctx.get(Params.MinDbSize.name))
        self.date_file = self.ctx.get(Params.DateFile.name)
        self.resource_type = self.ctx.get(Params.ResourceType.name)
        find_latest = self.ctx.get(Params.FindLatest.name)
        only_new = self.ctx.get(Params.OnlyNew.name)

        if find_latest:
            (self.file_to_url, self.db_date) = self.get_latest_db()
        else:
            (self.file_to_url, self.db_date) = self.get_db_sizes_and_date(self.db_date_dir)
        logging.info('Going to download databases for %s (file: [url, dbsize]): %s', self.db_date, json.dumps(self.file_to_url))

        if only_new:
            r = get_last_resource_with_attrs(self.resource_type, {'date': self.db_date})
            if r:
                logging.warning('Resource with date %s already exists: %s', self.db_date, r)
                # Not an error: a database for this date already exists, so
                # just exit without downloading or releasing anything.
                return

        # The directory may already exist (e.g. left by a previous attempt in
        # the same working directory); os.mkdir would raise in that case.
        if not os.path.isdir('dbs'):
            os.mkdir('dbs')
        for db_file in self.db_files:
            db_url, db_size = self.file_to_url[db_file]
            self.download_url(db_url, db_size, 'dbs/' + db_file)

        archive_name = self.db_dir_prefix + 'dbs.tgz'
        run_process(['tar', '-cvzf', archive_name, '-C', 'dbs/', '.'], wait=True, check=True, shell=False, log_prefix='tar')

        resource = self.create_resource(
            description=self.descr,
            resource_path=archive_name,
            resource_type=self.resource_type,
            attributes={'date': self.db_date},
        )
        self.mark_resource_ready(resource.id)

        if self.ctx.get(Params.ReleaseTo.name):
            # The child gets a copy of this task's context plus the release
            # flag, which routes it into the release branch of on_execute.
            child_params = dict(self.ctx)
            child_params.update({self.do_release_flag: True})
            self.create_subtask(self.type, 'Release ' + self.descr, child_params, priority=self.priority, execution_space=self.execution_space, inherit_notifications=True)

    def on_release(self, additional_parameters):
        """Invoke the release handlers of both base classes, Nanny first."""
        nanny.ReleaseToNannyTask.on_release(self, additional_parameters)
        SandboxTask.on_release(self, additional_parameters)

    def get_latest_db(self):
        """
            Find the newest database directory that passes validation.

            Fetches the listing at ``self.db_global_prefix``, extracts quoted
            entries of the form ``"<prefix><8 digits>/"`` and tries them from
            the lexicographically greatest name downwards.

            Returns the ``(file_to_url, db_date)`` pair produced by
            ``get_db_sizes_and_date`` for the first directory that validates.

            Raises SandboxTaskFailureError if the listing cannot be fetched,
            is empty, or no directory validates.
        """
        try:
            # The prefix is literal text, so escape it before embedding it
            # into the pattern.
            dir_pattern = '"(' + re.escape(self.db_dir_prefix) + r'\d{8})/"'
            listing = requests.get(self.db_global_prefix, timeout=self._HTTP_TIMEOUT)
            # Do not try to parse an HTTP error page as a directory listing.
            listing.raise_for_status()
            db_dirs = sorted(re.findall(dir_pattern, listing.text))
        except Exception as e:
            raise SandboxTaskFailureError("Can't find latest dir from %s: %s" % (self.db_global_prefix, str(e)))

        if not db_dirs:
            raise SandboxTaskFailureError('Empty dir list from ' + self.db_global_prefix)

        for db_dir in reversed(db_dirs):
            try:
                logging.info('Get db size and date from dir ' + db_dir)
                (file_to_url, db_date) = self.get_db_sizes_and_date(db_dir)
            except Exception as e:
                # A broken or incomplete directory is expected here; fall
                # back to the next older one.
                logging.warning("Bad latest db_dir (%s), try next ...", e)
            else:
                logging.info('Found latest dir: ' + db_dir)
                return file_to_url, db_date

        raise SandboxTaskFailureError("Can't find latest dir with valid size and dbdate file: " + self.db_global_prefix)

    def get_db_sizes_and_date(self, db_dir):
        """
            Read the database date and the size of every database file.

            db_dir is the directory name relative to ``self.db_global_prefix``.

            Returns ``(file_to_url, db_date)``: ``file_to_url`` maps each name
            from ``self.db_files`` to a ``(url, size)`` tuple, ``db_date`` is
            the ``YYYY-MM-DD`` string found at the start of the date file.

            Raises SandboxTaskFailureError on any failure: HTTP error, missing
            date, missing Content-Length header, or a file that is too small.
        """
        db_url_prefix = '/'.join((self.db_global_prefix, db_dir))
        db_date_url = '/'.join((db_url_prefix, self.date_file))
        file_to_url = {}
        try:
            date_response = requests.get(db_date_url, timeout=self._HTTP_TIMEOUT)
            date_response.raise_for_status()
            db_date_text = date_response.text
            logging.info('db_date_text from %s: %s', db_date_url, db_date_text)

            date_match = re.match(r'^(\d{4}-\d\d-\d\d)', db_date_text)
            if date_match is None:
                raise SandboxTaskFailureError('No YYYY-MM-DD date at the start of %s' % (db_date_url,))
            db_date = date_match.group(1)

            for db_file in self.db_files:
                db_url = '/'.join((db_url_prefix, db_file))
                head_response = requests.head(db_url, timeout=self._HTTP_TIMEOUT)
                # Without this check the Content-Length of an HTTP error page
                # could be mistaken for the database size.
                head_response.raise_for_status()
                db_headers = head_response.headers
                logging.info('db_headers from %s: %s', db_url, db_headers)

                content_length = db_headers.get('Content-Length')
                if content_length is None:
                    raise SandboxTaskFailureError('No Content-Length header for %s' % (db_url,))
                db_size = int(content_length)
                # NOTE(review): the MinDbSize description says "size <
                # min_db_size", while this check also rejects a size equal to
                # min_db_size -- confirm which one is intended.
                if db_size <= self.min_db_size:
                    raise SandboxTaskFailureError("Db size is too small %s: %d" % (db_url, db_size))
                file_to_url[db_file] = (db_url, db_size)
        except Exception as e:
            raise SandboxTaskFailureError("Can't get db size and date for %s: %s" % (db_url_prefix, str(e)))

        return file_to_url, db_date

    def download_url(self, url, size, out_file):
        """
            Download url into out_file with curl and verify the size on disk.

            size is the expected number of bytes. Raises
            SandboxTaskFailureError if the downloaded file has another size.
        """
        run_process(['curl', '-sS', '--retry', '3', '-w', '%{http_code} %{size_download}\n', '-o', out_file, url], wait=True, check=True, shell=False, log_prefix='curl')
        downloaded_size = os.path.getsize(out_file)

        if downloaded_size != size:
            raise SandboxTaskFailureError("Can't download full db from %s, see curl logs" % (url,))

        logging.info('Database downloaded to %s, size: %d', out_file, downloaded_size)

# Module-level alias for the task class defined above.
# NOTE(review): presumably the name the Sandbox task loader looks up -- confirm.
__Task__ = GetPokazometerDatabase
