# coding: utf-8

import calendar
import datetime
import json
import logging
import re
import time

import requests

import sandbox.common.errors as errors
from sandbox.projects import resource_types
from sandbox.projects.common import cms
from sandbox.projects.common import decorators
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.release_machine.core import const
from sandbox.projects.release_machine.helpers import wiki_helper
from sandbox.sandboxsdk import environments
from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk.task import SandboxTask

# Status strings produced by CheckFreshDocuments.check_in_yandex and written to the report.
_DOCUMENT_NOT_FOUND = 'Not_found'
_DOCUMENT_FOUND = 'Found'
# Wiki page paths (appended to const.Urls.WIKI_API) that receive the per-country report.
_WIKI_PAGE_RU = 'users/mvel/last-not-indexed-documents-ru/'
_WIKI_PAGE_TR = 'users/mvel/last-not-indexed-documents-tr/'


class Country(parameters.SandboxSelectParameter):
    # Task parameter choosing the country whose feeds and search settings are used.
    # CheckFreshDocuments.on_execute has settings for exactly the choices listed here.
    name = 'country'
    description = 'In what country check fresh'
    default_value = 'RU'
    choices = [('RU', 'RU'), ('TR', 'TR')]


class SitaPort(parameters.SandboxIntegerParameter):
    # Task parameter: port of the Sita service used to build the canonization request URL.
    name = 'sita_port'
    description = 'Sita port'
    group = 'Sita options'
    default_value = 12180


class SitaHost(parameters.SandboxStringParameter):
    # Task parameter: host of the Sita service used to build the canonization request URL.
    name = 'sita_host'
    description = 'Sita host'
    group = 'Sita options'
    default_value = 'sita.yandex.net'


class SitaCGI(parameters.SandboxStringParameter):
    # Task parameter: query string appended after '/json?' in the Sita request URL.
    name = 'sita_cgi'
    description = 'Sita CGI'
    group = 'Sita options'
    default_value = 'format_output=1&enum_mode=name'


class CheckFreshDocuments(SandboxTask):
    """
    Monitor how quickly fresh documents from news RSS feeds become findable in Yandex search.

    Feed items are canonized through Sita and looked up in search; the resulting
    delays are pushed to Solomon, written to a resource file and published on a
    wiki page.

    https://wiki.yandex-team.ru/jandekspoisk/refresh/monitoring-skorosti-indeksacii-svezhesti/
    """

    # Parameter classes whose values on_execute reads from self.ctx.
    input_parameters = [
        SitaPort,
        SitaHost,
        SitaCGI,
        Country,
    ]
    type = 'CHECK_FRESH_DOCUMENTS'
    # feedparser is imported lazily inside get_feed.
    environment = (environments.PipEnvironment('feedparser'),)

    def on_enqueue(self):
        """Create the report resource and remember its id in the task context.

        The id is stored under ``ctx["base_ru_res_id"]``; ``on_execute`` uses it
        to find the path the report is written to.
        """
        SandboxTask.on_enqueue(self)

        report_resource = self.create_resource(
            'FreshRSSBase',
            "base.txt",
            resource_types.FRESH_CHECK_DOCUMENTS,
        )
        self.ctx["base_ru_res_id"] = report_resource.id

    def _update_wiki_page(self, logs_data):
        """Publish ``logs_data`` as the body of the current country's wiki report page.

        The request is authorized with an OAuth token taken from the vault.
        """
        wiki_session = requests.Session()
        oauth_token = self.get_vault_data(const.COMMON_TOKEN_OWNER, const.COMMON_TOKEN_NAME)
        wiki_session.headers['Authorization'] = "OAuth {}".format(oauth_token)
        # NOTE(review): certificate verification is switched off for this session — confirm this is intended.
        wiki_session.verify = False

        wiki_helper.post_on_wiki_and_check(
            wiki_session,
            const.Urls.WIKI_API + self.__country_settings['wiki_page'],
            {
                'title': 'Last not indexed documents {}'.format(self.__country),
                'body': logs_data,
            },
        )

    def kiwi_url_canonize(self, target_link):
        """Ask Sita for the canonical form of ``target_link``.

        A URL-canonization request is POSTed as JSON to
        ``http://<sita_host>:<sita_port>/json?<sita_cgi>``. The canonized URL is
        read from the parsed JSON response; when that is not possible, the raw
        response text is searched for a ``"MainUrl"`` value instead.

        :param target_link: URL to canonize.
        :return: the canonized URL as an ASCII byte string, or ``target_link``
            unchanged when no usable canonized URL could be extracted.
        """
        logging.debug('try to canonize %s', target_link)

        data_for_send = json.dumps({
            'Actions': [{
                'Type': u'AT_URL_CANONIZATION',
                'UrlCanonizationData': {
                    'CanonizationType': u'STRONG'
                }
            }],
            'AuthInfo': {
                'Type': u'BASIC',
                'User': u'any'
            },
            'Data': [{
                'Url': target_link
            }],
            'Settings': {
                'UrlValidator': u'ROBOT_URL_VALIDATOR'
            },
        })

        # example of sita_request: http://sita.yandex.net:12180/json?format_output=1&enum_mode=name
        sita_request_string = 'http://{}:{}/json?{}'.format(self.__sita_host, str(self.__sita_port), self.__sita_cgi)
        sita_output = requests.post(sita_request_string, data_for_send).text

        logging.debug("Sita response:")
        logging.debug(sita_output)

        main_url = None
        try:
            json_output = json.loads(sita_output)
            main_url = json_output['Results'][0]['UrlCanonizationResult']['GeminiResponse']['CanonizedUrl'].encode()
        except Exception:
            # Deliberate best effort: any malformed or unexpected response falls
            # through to the text-based fallback below.
            logging.debug("Cannot read CanonizedUrl from Sita response", exc_info=True)

        if main_url is not None:
            logging.debug("Canonization completed: %s", main_url)
            return main_url

        try:
            match_main_url = re.search('("MainUrl")([^"]*(")([^"]*))', sita_output)
            if match_main_url:
                # Encode like the primary path so callers always get the same
                # string type; mixing unicode into the report rows breaks them.
                main_url = match_main_url.group(4).encode()
        except Exception:
            logging.debug("Cannot read MainUrl from Sita response", exc_info=True)

        if main_url is None:
            logging.debug("Canonization fail. Using original url")
            return target_link

        logging.debug("Canonization completed with trouble: %s", main_url)
        return main_url

    @decorators.retries(3, delay=5)
    def post_stats_to_solomon(self, data_for_send):
        """POST an already serialized JSON payload to the Solomon push endpoint.

        :return: the ``requests`` response object; its HTTP status is not inspected here.
        """
        return requests.post(
            'http://api.solomon.search.yandex.net/push/json',
            data=data_for_send,
            headers={'Content-Type': 'application/json'},
        )

    def send_stats_to_solomon(self, maximum_age, problem_documents_count, average_age, all_documents_count):
        """Serialize the four monitoring values and push them to Solomon.

        ``maximum_age`` and ``average_age`` are divided by 60 before being sent
        as the ``MaxDelayMin`` and ``AverDelayMin`` sensors; the two counters are
        sent as given. The lower-cased country is used as the ``service`` label.
        """
        common_labels = {
            'project': 'check_fresh_documents',
            'cluster': 'push',
            'service': self.__country.lower(),
        }
        sensor_values = (
            ('MaxDelayMin', maximum_age / 60),
            ('NumProblemDocs', problem_documents_count),
            ('AverDelayMin', average_age / 60),
            ('NumAllDocs', all_documents_count),
        )
        payload = {
            'commonLabels': common_labels,
            'sensors': [
                {'labels': {'sensor': sensor_name}, 'value': sensor_value}
                for sensor_name, sensor_value in sensor_values
            ],
        }

        serialized_payload = json.dumps(payload)
        logging.debug('JSON for solomon %s', serialized_payload)

        logging.debug('Solomon response %s', self.post_stats_to_solomon(serialized_payload))

    @staticmethod
    def get_instance_host_and_port():
        """Return ``(host, port)`` of the first instance CMS reports.

        CMS is queried with ``('MAN_WEB_FUSION_3DAY_MMETA', 'SAAS')``; only the
        first returned entry is used.
        """
        first_instance = cms.get_cms_instances('MAN_WEB_FUSION_3DAY_MMETA', 'SAAS')[0]
        host = first_instance['host']
        port = first_instance['port']

        logging.debug(
            "Instance_settings: host=%s; shard=%s; port=%d;",
            host, first_instance['shard'], port
        )

        return host, port

    def check_in_yandex(self, ya_host, ya_port, url, parameters, timeout=60):
        """Check whether the search results page contains a link to ``url``.

        The page ``<ya_host>:<ya_port>/yandsearch`` is requested with
        ``parameters`` merged with the country's CGI parameters, and its HTML is
        searched for ``href="<url>"`` with and without a trailing slash.

        :param ya_host: scheme and host of the search endpoint.
        :param ya_port: port of the search endpoint.
        :param url: URL to look for (byte string or unicode).
        :param parameters: query parameters of the search request; not modified.
        :param timeout: seconds to wait for the search response.
        :return: ``_DOCUMENT_FOUND``, ``_DOCUMENT_NOT_FOUND`` or
            ``'Yandex_no_answer'`` when the request itself failed.
        """
        # example of yandex_search_address:
        # http://man1-0008.search.yandex.net:8028/yandsearch?text=url:http://lenta.ru/news/2015/11/12/pollution/&metahost2=^WEB:QUICK&pron=nometarank&ms=proto&hr=da
        yandex_search_address = '{}:{}/yandsearch'.format(ya_host, str(ya_port))

        logging.debug("Yandex_search_address URL to get serp: %s", yandex_search_address)

        # Merge into a copy so the caller's dict is left untouched.
        query = dict(parameters)
        query.update(self.__country_settings['cgi_parameters'])

        try:
            yandex_response = requests.get(yandex_search_address, params=query, verify=False, timeout=timeout)
        except Exception:
            # Best effort: an unreachable or too slow search backend is reported
            # as a status instead of failing the task. Keep the traceback.
            logging.exception("Yandex MMETA response error on url: %s", url)
            return 'Yandex_no_answer'

        yandex_search_html = yandex_response.text

        # Compare in unicode: the response text is unicode, and formatting a
        # non-ASCII unicode URL into a byte-string template would raise.
        if isinstance(url, bytes):
            url = url.decode("utf-8")
        if url[-1] != '/':
            url += '/'
        tag_with_slash = u'href="{}"'.format(url)
        tag_without_slash = u'href="{}"'.format(url[:-1])

        if tag_with_slash in yandex_search_html or tag_without_slash in yandex_search_html:
            return _DOCUMENT_FOUND

        logging.debug("Yandex have no such URL. yandex_search_html: \n%s", yandex_search_html)
        return _DOCUMENT_NOT_FOUND

    @decorators.retries(4, delay=5)
    def get_fresh_url(self, fresh_url):
        """Fetch ``fresh_url`` (5 second timeout) and return the final URL as ``str``."""
        response = requests.get(fresh_url, timeout=5)
        # ``response.url`` is the address reached after redirects were handled.
        return str(response.url)

    @decorators.retries(4, delay=5)
    def get_feed(self, rss_page):
        """Parse the RSS feed at ``rss_page`` with feedparser.

        :return: the parsed feed.
        :raises errors.TemporaryError: when feedparser sets the ``bozo`` flag;
            the error carries feedparser's ``bozo_exception``.
        """
        import feedparser

        parsed_feed = feedparser.parse(rss_page)
        if parsed_feed['bozo'] > 0:
            raise errors.TemporaryError(parsed_feed['bozo_exception'])
        return parsed_feed

    def parse_rss_and_check_urls(self):
        """Collect search-presence information for every item of the country's RSS feeds.

        For each feed item the link is fetched to resolve redirects, canonized
        through Sita and looked up in Yandex search. Items whose link cannot be
        fetched are logged and skipped.

        :return: dict mapping the canonized URL to a dict with the keys
            ``url_title`` (UTF-8 byte string, a leading ``HH:MM `` prefix removed),
            ``url_delay`` (item age in seconds, as a string), ``url_status``
            (result of ``check_in_yandex``), ``query_in_site_status``,
            ``query_status``, ``query_no_quotation_status`` (always ``'unknown'``)
            and ``in_limit`` (``True`` when the item is younger than one day).
        """
        max_url_delay = 86400  # seconds in 1 day
        ya_host = self.__country_settings['host_url']
        ya_port = 80

        url_info_results = {}

        for rss_page, tdif in self.__country_settings['rss_pages'].iteritems():
            # get_feed raises for malformed feeds, so the result is usable as is.
            feed = self.get_feed(rss_page)
            logging.debug(
                "Feed from=%s; feed=%s;\n",
                rss_page,
                feed,
            )

            for item in feed['items']:
                url_date_from_rss = item['published_parsed']
                utc_now = calendar.timegm(datetime.datetime.utcnow().timetuple())
                # ``tdif`` is the per-feed correction, in hours, applied to the published time.
                ts_item = calendar.timegm(url_date_from_rss) + tdif * 3600
                url_delay = int(utc_now - ts_item)

                original_url = item['link'].encode("utf-8")

                try:
                    fresh_url = self.get_fresh_url(original_url)
                except Exception:
                    # TODO: report how many links could not be fetched
                    logging.warning("Cannot fetch %s, skipping it", original_url, exc_info=True)
                    continue

                url_title = item['title']
                time_prefix = re.match(r'\d\d:\d\d ', url_title)
                if time_prefix:
                    # Drop the whole prefix, including the space that follows the time.
                    url_title = url_title[time_prefix.end():]
                url_title = url_title.encode("utf-8")

                canonized_url = self.kiwi_url_canonize(fresh_url)

                url_status = self.check_in_yandex(ya_host, ya_port, canonized_url, {
                    'text': u"url:{}".format(canonized_url)
                })
                logging.debug('url_status for %s = %s', canonized_url, url_status)

                url_info_results[canonized_url] = {
                    'url_title': url_title,
                    'url_delay': str(url_delay),
                    'url_status': url_status,
                    'query_in_site_status': 'unknown',
                    'query_status': 'unknown',
                    'query_no_quotation_status': 'unknown',
                    'in_limit': url_delay < max_url_delay,
                }

        return url_info_results

    def create_log(self, url_check_stats, resource_path):
        """Turn the per-URL results into the report file, Solomon stats and wiki page.

        One tab-separated row per URL, ordered by decreasing delay, is written
        to ``resource_path``. Only rows with ``in_limit`` set are counted; among
        them, rows with status ``_DOCUMENT_NOT_FOUND`` supply the problem count,
        the maximum and average delay, and the table published on the wiki.

        :param url_check_stats: mapping produced by ``parse_rss_and_check_urls``.
        :param resource_path: path of the file the report rows are written to.
        """
        wiki_template = '===Date: {}=== \n #|\n|| URL | Delay(Seconds) ||\n{}\n|#\n'

        # Largest delay first; entries with equal delays keep their relative order.
        entries_by_age = sorted(
            url_check_stats.iteritems(),
            key=lambda entry: int(entry[1]['url_delay']),
            reverse=True,
        )

        report_lines = []
        missing_documents = []
        in_limit_count = 0

        for canonized_url, features in entries_by_age:
            row = '\t'.join((
                features['url_status'],
                features['query_in_site_status'],
                features['query_status'],
                features['query_no_quotation_status'],
                str(features['in_limit']),
                features['url_delay'],
                canonized_url,
                features['url_title'],
            ))
            logging.debug('{}\n'.format(row))
            report_lines.append(u'{}\n'.format(row.decode('utf-8')))

            if features['in_limit'] is not True:
                continue
            in_limit_count += 1
            if features['url_status'] == _DOCUMENT_NOT_FOUND:
                missing_documents.append([canonized_url, int(features['url_delay'])])

        missing_delays = [delay for _, delay in missing_documents]
        problem_count = len(missing_documents)
        # The maximum never drops below zero, even if every delay is negative.
        oldest_delay = max([0] + missing_delays)
        mean_delay = sum(missing_delays)
        if problem_count != 0:
            mean_delay /= problem_count

        self.send_stats_to_solomon(oldest_delay, problem_count, mean_delay, in_limit_count)

        with open(resource_path, 'w') as report_file:
            report_file.write(u''.join(report_lines).encode("utf-8"))

        wiki_rows = ''.join(
            '\n|| {} | {} ||\n'.format(url, delay) for url, delay in missing_documents
        )
        timestamp = time.asctime(time.localtime())

        data_to_wiki = wiki_template.format(timestamp, wiki_rows)
        logging.debug('data_to_wiki %s', data_to_wiki)
        self._update_wiki_page(data_to_wiki.decode("utf-8"))

    def on_execute(self):
        """Run the check: read parameters, pick country settings, collect stats, report.

        Parameters are read from the task context. A country without settings
        is reported through ``eh.fail``.
        """
        task_ctx = self.ctx
        self.__country = task_ctx[Country.name]
        self.__sita_host = task_ctx[SitaHost.name]
        self.__sita_port = task_ctx[SitaPort.name]
        self.__sita_cgi = task_ctx[SitaCGI.name]

        logging.debug(
            "Parameters: country=%s; sita_host=%s; sita_port=%d; sita_cgi=%s",
            self.__country, self.__sita_host, self.__sita_port, self.__sita_cgi
        )

        # ``rss_pages`` maps a feed URL to the hour correction applied to its item times.
        settings_by_country = {
            'TR': {
                'rss_pages': {
                    'http://rss.feedsportal.com/c/32727/f/510259/index.rss': 0,
                    'http://www.sabah.com.tr/rss/sondakika.xml': -3
                },
                'cgi_parameters': {
                    'lr': '11508',
                    'uil': 'tr',
                },
                'wiki_page': _WIKI_PAGE_TR,
                'host_url': 'http://yandex.com.tr',
            },
            'RU': {
                'rss_pages': {
                    'http://lenta.ru/rss': 0,
                    'http://ria.ru/export/rss2/index.xml': 0
                },
                'cgi_parameters': {
                    'lr': '213',
                },
                'wiki_page': _WIKI_PAGE_RU,
                'host_url': 'http://yandex.net',
            },
        }

        if self.__country in settings_by_country:
            self.__country_settings = settings_by_country[self.__country]
        else:
            eh.fail('Country {} is not supported yet'.format(self.__country))

        url_check_stats = self.parse_rss_and_check_urls()
        report_path = channel.sandbox.get_resource(self.ctx["base_ru_res_id"]).path
        self.create_log(url_check_stats, report_path)


# Module-level alias for the task class.
# NOTE(review): presumably this is how the Sandbox loader locates the task — confirm.
__Task__ = CheckFreshDocuments
