# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import cgi
import collections
import datetime
import gzip
import json
import logging
import os
import time
import jinja2
from six.moves import cStringIO
from six.moves import filterfalse
from six.moves.urllib import parse

from sandbox import sdk2
from sandbox.common import rest
from sandbox.common.types.task import Status
from sandbox.common.types.task import TaskStatus

from sandbox.projects import resource_types
from sandbox.projects.common import decorators
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.common import file_utils as fu
from sandbox.projects.common import templates
from sandbox.projects.common import requests_wrapper
from sandbox.projects.common import utils
from sandbox.projects.common.search.response import cgi as sb_cgi
from sandbox.projects.release_machine import security as rm_sec


# Owners of this FindUrl integration (presumably staff logins - confirm before relying on it).
# Not referenced in the visible part of this file.
FINDURL_OWNERS = [
    "mvel",
    "lebedev-aa",
]


# Extra CGI parameters appended to the Metrics CGIs before they are sent to FindUrl
# (see FindUrl.run_batch_task).
ADDITIONAL_CGIS = '&dump_groupings=yes&json_dump=1'

# Hex RGB colours passed to the query diff template. `*` binds tighter than `+`, so the list
# is a single leading 'FFFFFF' followed by the colour list repeated 5 times.
# Presumably indexed by Document.color_id (0 by default) - confirm against the template.
COLOR_PALETTE = ['FFFFFF'] + [
    '000099', '990000', '009900', '019998', '994698', '997566', '909592', '007699', '759900', '648266', '988900',
    '782772', '854900', '990056', '442400', '004878', '683735', '263400', '577393', '005917', '980088', '001544',
    '628699', '997443', '017099', '004754', '856998', '788231', '084641', '917065', '589970', '968488', '558800',
    '430026', '789974', '009966', '998502', '620800', '008996', '989952', '754451', '005939', '656882', '594748',
    '475740', '459972', '995167', '009599', '885858'
] * 5

# Rank of every relevance label: the higher the number, the more relevant the document.
# The empty label (document without a label) has the highest rank, so such documents are never
# filtered out by Document.set_relevance. Both the string 'None' and None itself map to -1.
RELEVANCES = {
    '': 10,
    'VITAL': 8,
    'USEFUL': 7,
    'RELEVANT_PLUS': 6,
    'RELEVANT_MINUS': 5,
    'IRRELEVANT': 4,
    'SPAM': 3,
    'STUPID': 2,
    'VIRUS': 1,
    '_404': 0,
    'None': -1,
    None: -1,
}

# Documents ranked below this label are marked as not relevant (Document.is_relevant = False).
MINIMAL_RELEVANCE = 'RELEVANT_PLUS'

# Short names of relevance labels passed to the html templates; unknown labels give 'UNK'.
RELEVANCES_SHORT = collections.defaultdict(lambda: 'UNK')
RELEVANCES_SHORT.update({
    'VITAL': 'VITL',
    'USEFUL': 'USFL',
    'RELEVANT_PLUS': 'R+',
    'RELEVANT_MINUS': 'R-',
    'IRRELEVANT': 'IRRL',
    'SPAM': 'SPAM',
    'STUPID': 'STPD',
    'VIRUS': 'VIRS',
    '_404': '_404',
})

# Metric used when no metric name is given, keyed by evaluation type:
# 'pfound' for everything except 'MOBILE'.
DEFAULT_METRIC = collections.defaultdict(lambda: 'pfound')
DEFAULT_METRIC['MOBILE'] = 'serp-mobile-five-cg-5'

# Serpset id values that make FindUrl.compare_searches skip the query group.
BAD_SERPSET_IDS = [0, -1, '-1', '0', 'null', 'None', None]
# Default `threshold` of get_top_queries_from_serpsets_diff; also passed to the main html template.
DEFAULT_DIFF_THRESHOLD = -1e-2

# Base url of the FindUrl service; note the trailing slash, api paths are appended directly.
FINDURL_BACKEND = 'https://findurl.z.yandex-team.ru/'


class FindUrl(object):

    class Task(object):
        """
            A FindUrl batch task started for one query basket, with the documents sent to it.

            ``status`` holds one of the ``State`` messages, ``task_id`` is the id returned by FindUrl
            (it stays None when nothing was started on the FindUrl side).
        """

        class State(object):
            # Human-readable statuses stored in ``Task.status``
            not_started_yet = "Task is not started yet"
            not_needed = "Task is not needed, no lost documents found"
            failed_to_start = "Task failed to start"
            started = "Task started"
            requests_failed = "Request to FindUrl failed"
            bad_reply = "FindUrl returned incorrect reply"
            done = "Task is DONE. Some documents are NOT in TOP10"
            done_all_in_top10 = "Task is DONE. All documents are in TOP10"
            working = "Task is WORKING"
            failed = "Task is FAILED"
            stopped = "Task is STOPPED"
            unrecognized = "Status returned from FindUrl is not recognized"
            # Template: ``{}`` is replaced with a link to the logs of the sandbox task
            fatal = 'Fatal error. Check <a href={} target="_blank">logs</a> to see whats happened'
            null_serpset_id = "Skipped. Metrics had some problems and hasn't return serpset id"

        def __init__(self, status=None, task_id=None, documents=None, name='', metrics_url=None):
            """
                :param status: one of the ``State`` messages
                :param task_id: id of the task on the FindUrl side, None if it was not started
                :param documents: list of Document objects sent to FindUrl
                :param name: name shown in the report
                :param metrics_url: link to the corresponding Metrics page
            """
            self.status = status
            self.task_id = task_id
            self.documents = documents or []
            self.name = name
            self.metrics_url = metrics_url
            self.high_diff_queries_amount = 0

        def __str__(self):
            if self.task_id:
                # FINDURL_BACKEND ends with a slash; strip it so the link does not contain "//trace"
                return "Findurl task page: {}/trace/{}/info, status: {}".format(
                    FINDURL_BACKEND.rstrip('/'), self.task_id, self.status
                )
            return "Findurl task status: {}".format(self.status)

        def populate_docs_from_findurl_json(self, reply_json):
            """
                Copy the fields FindUrl returned for every query/url pair onto the matching documents.

                A document matches when both its query text and original url are equal to the ones
                in the reply. Attributes the document already has are left untouched.
                :param reply_json: parsed FindUrl reply with the 'qurls' list
            """
            for doc_json in reply_json['qurls']:
                for doc_obj in self.documents:
                    if doc_json['query'] == doc_obj.query.text and doc_json['url'] == doc_obj.orig_url:
                        for k, v in doc_json.items():
                            # setdefault: never overwrite what is already set (e.g. ``query``)
                            doc_obj.__dict__.setdefault(k, v)
                        doc_obj.task = self

    class Query(object):
        """
            One query of a Metrics serpset diff together with everything collected for it:
            documents returned by the betas, documents stored in Metrics and the link to the diff page.
        """

        def __init__(self, query_json):
            """
                :param query_json: dict with the 'diff' value and the 'query' description
                    ('text', 'regionId', 'uid', 'device' and optional 'country')
            """
            query_info = query_json['query']
            self.text = query_info['text']
            self.region_id = str(query_info['regionId'])
            self.diff_value = '%.4f' % query_json['diff']
            self.uid = query_info['uid']
            self.device = query_info['device']
            self.country = query_info.get('country')
            self.metrics_diff = None
            self.reference_docs = []
            self.checked_docs = []
            self.reference_metrics_docs = []
            self.checked_metrics_docs = []
            self.lost_documents_amount = 0

        def set_diff_url(self, regional, evaluation, reference_serpset_id, checked_serpset_id, metric_name=''):
            """
                Store in ``metrics_diff`` an html link to the Metrics page comparing this query
                in two serpsets. The link text is the metric diff value.
                :param regional: regional type, passed to the 'regional' url parameter
                :param evaluation: evaluation type; also selects the default metric
                :param reference_serpset_id: id of the reference serpset
                :param checked_serpset_id: id of the checked serpset
                :param metric_name: metric to show, DEFAULT_METRIC[evaluation] when empty
                :return: self
            """
            def quote_utf8(value):
                # Quoting utf-8 bytes gives a plain ascii string on both python 2 and python 3.
                # Formatting raw bytes would give "b'...'" on python 3 and fail on non-ascii
                # values on python 2.
                return parse.quote(value.encode("utf-8"))

            diff_page_url = (
                'https://metrics.yandex-team.ru/mc/qjudgement?regional={}&evaluation={}&aspect=tcg&'
                'metric={}&serpset={}&serpset={}&serpset-filter=onlySearchResult&serpset-filter=onlySearchResult&'
                'query={}&region-id={}{}&device={}{}'
            ).format(
                regional,
                evaluation,
                metric_name or DEFAULT_METRIC[evaluation],
                checked_serpset_id,
                reference_serpset_id,
                quote_utf8(self.text),
                self.region_id,
                "&uid={}".format(quote_utf8(self.uid)) if self.uid else '',
                quote_utf8(self.device),
                # country is added only for queries without uid
                "&country={}".format(self.country) if (not self.uid and self.country) else '',
            )
            self.metrics_diff = '<a href="{}" target="_blank">{}</a>'.format(diff_page_url, self.diff_value)
            return self

        def set_metrics_docs(self, indexed_serpset, set_reference):
            """
                Fill the list of documents Metrics has for this query.
                :param indexed_serpset: mapping query text -> dict with 'urls_list'
                    (and the data used by Document.set_relevance)
                :param set_reference: store the result in ``reference_metrics_docs`` when true,
                    in ``checked_metrics_docs`` otherwise
                :return: self; nothing is changed when the query is not in the serpset
            """
            logging.debug("Looking up url list on query %s", self.text)
            try:
                urls_list = indexed_serpset[self.text]['urls_list']
            except KeyError:
                logging.debug('Failed to find query')
                return self

            logging.debug("Query found, it has these urls: %s", urls_list)
            docs = [FindUrl.Document(self, url) for url in urls_list]
            for doc in docs:
                doc.set_relevance(indexed_serpset)
            if set_reference:
                self.reference_metrics_docs = docs
            else:
                self.checked_metrics_docs = docs
            return self

        def recreate_metrics_request(self, metrics_cgis, host_name):
            """
                Build the url of the search request for this query on the given host.
                :param metrics_cgis: CGI string appended after the query text and region parameters
                :param host_name: host the request is addressed to
                :return: url string
            """
            url = sb_cgi.UrlCgiCustomizer(
                base_url='{}/search/'.format(host_name)
            ).add_text(self.text.encode("utf-8")).add_region(self.region_id)
            return '{}?{}&{}'.format(url.base_url, url.params_to_str(), metrics_cgis)

    class Document(object):
        """
            A url returned for a query. Two documents are equal when their urls are equal
            ignoring case.
        """

        def __init__(self, query, orig_url, reference_position=None):
            """
                :param query: Query object the document was returned for
                :param orig_url: document url; scheme-relative urls ('//host/...') get the 'http:' prefix
                :param reference_position: position of the document in the reference output
            """
            self.query = query
            self.orig_url = ('http:' if orig_url.startswith('//') else '') + orig_url
            self.reference_position = reference_position
            self.relevance = ''
            self.is_relevant = True
            self.color_id = 0
            self.position_change = 0
            self.btw = None
            self.btw_result = None
            # Called for its side effect: wizard and blender urls get the ``type`` attribute
            is_wizard_or_blender_url(self)

        def set_relevance(self, *indexed_serpsets):
            """
                Take the relevance label of the document from the given serpsets.

                Serpsets are looked through in order, so the last one that knows the url wins.
                ``is_relevant`` is dropped to False as soon as a label ranked below
                MINIMAL_RELEVANCE is met and is never set back to True.
                :param indexed_serpsets: mappings query text -> dict with 'urls_to_relevances'
                :return: self
            """
            for indexed_serpset in indexed_serpsets:
                try:
                    urls_to_relevances = indexed_serpset[self.query.text]['urls_to_relevances']
                except KeyError:
                    logging.debug('Failed to find query')
                    continue
                try:
                    self.relevance = urls_to_relevances[self.orig_url] or ''
                    # A label missing from RELEVANCES raises KeyError as well and is only logged
                    if RELEVANCES[self.relevance] < RELEVANCES[MINIMAL_RELEVANCE]:
                        self.is_relevant = False
                except KeyError:
                    logging.debug('Failed to find url')
            return self

        def __eq__(self, other):
            other_url = getattr(other, 'orig_url', None)
            if other_url is None:
                # Not a document-like object: let python fall back to the default comparison
                return NotImplemented
            return self.orig_url.lower() == other_url.lower()

        def __ne__(self, other):
            # python 2 does not derive != from ==, define it explicitly
            equal = self.__eq__(other)
            return equal if equal is NotImplemented else not equal

        def __hash__(self):
            # Kept consistent with __eq__; defining __eq__ alone makes the class unhashable on python 3
            return hash(self.orig_url.lower())

    class BisectTaskWrapper(object):
        """
            Starts a sandbox bisect task looking for the commit that made a document
            disappear, and reads the result of that task.
        """

        def __init__(self, current_sandbox_task, host, query_text, sample_request, doc_position):
            """
                :param current_sandbox_task: sandbox task the bisect task is created from
                :param host: checked beta host, like "noapache.priemka.yandex.ru"
                :param query_text: text of the query the document was lost on
                :param sample_request: request passed to the bisect task as is
                :param doc_position: position of the lost document
            """
            self.current_sandbox_task = current_sandbox_task
            self.host = host  # like "noapache.priemka.yandex.ru"
            self.query_text = query_text
            self.sample_request = sample_request
            self.doc_position = doc_position
            # Stays None until run() has successfully enqueued a bisect task
            self.task_id = None

        def _enqueue(self, task_type, description, params):
            """Create and enqueue a sandbox task of the given type, return its id."""
            return sdk2.Task[task_type](
                self.current_sandbox_task,
                description=description,
                create_sub_task=False,
                **params
            ).enqueue().id

        def run(self):
            """
                Start the bisect task matching the host type and remember its id in ``task_id``.
                Nothing is started when the host looks like neither upper nor middle search.
            """
            description = "Searching for commit that caused diff to show in FindUrlBucket {}".format(
                self.current_sandbox_task.id,
            )
            if 'upper' in self.host or 'noapache' in self.host:
                params = {
                    'noapache_beta': self.host,
                    'user_query': self.query_text,
                    'request': self.sample_request,
                    'doc_position': self.doc_position,
                }
                logging.debug("Starting noapache bisect task with following params: %s", params)
                self.task_id = self._enqueue('BISECT_NOAPACHEUPPER_DIFF_REASON', description, params)

            elif 'middle' in self.host or 'mmeta' in self.host:
                params = {
                    # The third dash-separated part of the host name is used as the version
                    'version': int(self.host.split('-')[2]),
                    'middle_beta': self.host,
                    'user_query': self.query_text,
                    'request': self.sample_request,
                    'doc_position': self.doc_position,
                }
                logging.debug("Starting middle bisect task with following params: %s", params)
                self.task_id = self._enqueue('BISECT_MIDDLESEARCH_DIFF_REASON', description, params)
            else:
                logging.debug(
                    "%s is specified as host. It doesnt look like middle or upper. Not running bisect", self.host
                )

        # An exception raised by get_result makes the decorator retry the call
        # (exact meaning of the arguments is defined by decorators.retries)
        @decorators.retries(6, 2, 4)
        def get_result(self):
            """
                Read the bisect result from the sandbox task.
                :return: dict with the "Result" key: the bisect info stored in the task context,
                    or just the task status when the task finished without one
                :raises Exception: while the task is still running, so that the call is retried
            """
            if self.task_id is None:
                # run() started nothing: answer right away instead of failing on every retry
                # and being reported as a timeout
                logging.debug("Bisect task was not started for host %s", self.host)
                return {"Result": "Bisect task was not started"}

            task_data = rest.Client().task.read(
                id=self.task_id,
                fields='status,context.bisect_info',
                children=True,
                hidden=True,
                limit=1,
            ).get("items", [{}])[0]
            logging.debug("Task status and bisect info: %s", task_data)

            res = task_data["context.bisect_info"]

            logging.debug("Received result %s", res)
            if res:
                if not res.get("Result", False):
                    res["Result"] = task_data["status"]
                return res
            if any(task_data["status"] in gr for gr in [Status.Group.FINISH, Status.Group.BREAK, [TaskStatus.UNKNOWN]]):
                return {"Result": task_data["status"]}
            # for decorator to retry
            raise Exception("Bisect task {} has not finished yet".format(self.task_id))

    def __init__(self):
        """Load the html templates used to render the report pages."""
        load_template = templates.get_html_template
        self.FINDURL_META_TEMPLATE = load_template('findurl_meta.html')
        self.QUERY_DIFF_TEMPLATE = load_template('query_diff.html')

    def generate_result_site(self, current_sandbox_task, tasks, base_ticket_key):
        """
            Render the report into a FINDURL_META_HTML resource: two pages for every query
            (responses of the betas and responses stored in Metrics) plus the main page.
            :param current_sandbox_task: sandbox task that owns the resource
            :param tasks: iterable of Task objects
            :param base_ticket_key: passed to the main page template
            :return: url of the main page behind the resource http proxy
        """
        tasks = list(tasks)

        meta_resource = resource_types.FINDURL_META_HTML(
            current_sandbox_task, "Meta info", "data", test_attr=1, ttl=30
        )
        resource_data = sdk2.ResourceData(meta_resource)
        resource_data.path.mkdir(0o755, parents=True, exist_ok=True)
        output_dir = str(resource_data.path)

        # Every query is rendered once even if several documents refer to it
        unique_queries = list({doc.query for task in tasks for doc in task.documents})
        for index, query in enumerate(unique_queries):
            query.html_id = index

            query.betas_responses = 'betas_responses_{}.html'.format(index)
            betas_page = self._generate_query_html(query.reference_docs, query.checked_docs, query, 'manual')
            fu.write_file(os.path.join(output_dir, query.betas_responses), betas_page)

            query.metrics_responses = 'metrics_responses_{}.html'.format(index)
            metrics_page = self._generate_query_html(
                query.reference_metrics_docs, query.checked_metrics_docs, query, 'metrics'
            )
            fu.write_file(os.path.join(output_dir, query.metrics_responses), metrics_page)

        main_page = self._generate_main_html(tasks, base_ticket_key)
        main_page_name = "findurl_bucket.html"
        fu.write_file(os.path.join(output_dir, main_page_name), main_page)

        resource_data.ready()

        return '{}/{}'.format(meta_resource.http_proxy, main_page_name)

    def _generate_query_html(self, reference_docs, checked_docs, query, run_type):
        """
            Render the page comparing two lists of documents of one query.
            :param reference_docs: documents of the reference side
            :param checked_docs: documents of the checked side
            :param query: Query object the documents belong to
            :param run_type: passed to the template as is ('manual' or 'metrics' in this file)
            :return: utf-8 encoded html
        """
        # Both lists are handed to get_serp_colors_diff before being rendered
        get_serp_colors_diff(reference_docs, checked_docs)

        template = jinja2.Template(self.QUERY_DIFF_TEMPLATE)
        context = {
            'checked_headers': ['url', 'relev', 'color_id'],
            'reference_headers': ['number', 'color_id', 'relev', 'url'],
            'checked_docs': checked_docs,
            'reference_docs': reference_docs,
            'table_length': max(len(reference_docs), len(checked_docs)),
            'query': query,
            'COLOR_PALETTE': COLOR_PALETTE,
            'run_type': run_type,
            'RELEVANCES_SHORT': RELEVANCES_SHORT,
        }
        return template.render(**context).encode('utf-8')

    def _generate_main_html(self, tasks, base_ticket_key):
        """
            Render the main report page listing all tasks.
            :param tasks: Task objects to show
            :param base_ticket_key: passed to the template as is
            :return: utf-8 encoded html
        """
        # The 'commit' column is shown only if some task was marked as bisected
        has_bisected_task = any(getattr(task, 'bisected', False) for task in tasks)
        columns = [
            'lr',
            'orig_query',
            'orig_url',
            'relevance',
            'serpset_diff',
            'serp_pos',
            'pos',
            'status',
            'info',
            'req_ids',
        ] + (['commit'] if has_bisected_task else [])

        # Sets of names passed to the template
        # (presumably select the presentation of the cells - confirm against the template)
        presentation = {
            'hyphenated': {'query', 'url', 'serpset_diff', 'status', 'info', 'commit', 'req_ids'},
            'centered': {'relev', 'serp_pos', 'pos', 'region'},
            'tooltiped': {'query', 'url', 'info', 'status'},
        }

        template = jinja2.Template(self.FINDURL_META_TEMPLATE)
        page = template.render(
            headers=columns,
            tasks=tasks,
            current_time=datetime.datetime.now(),
            base_ticket_key=base_ticket_key,
            RELEVANCES_SHORT=RELEVANCES_SHORT,
            DEFAULT_DIFF_THRESHOLD=DEFAULT_DIFF_THRESHOLD,
            _cgi=cgi,
            **presentation
        )
        return page.encode('utf-8')

    @staticmethod
    def compare_searches(
        metrics_json_response,
        max_missing_docs_per_basket=20,
        max_queries_per_basket=50,
        run_bisect=False,
        current_sandbox_task=None,
    ):
        """
            Help to understand why a new search beta differs from the reference one.

            The comparison is done on several query baskets. The top queries of every basket with
            the largest degradation of the default metric are re-sent 'manually' to the betas.
            Documents present in the output of the reference beta but absent from the output of the
            new one are called lost. If they are relevant enough (or have no label) they are
            considered 'interesting' and are sent to FindUrl. The method waits for the FindUrl tasks
            for a limited time and produces them with their names (taken from the names of the
            query baskets), urls and statuses.

            This is a generator: nothing is requested until it is iterated.
            :param metrics_json_response: JSON returned by metrics launcher with the launch results
            :param max_missing_docs_per_basket: maximum number of documents sent to FindUrl
                per basket, 0 - unlimited
            :param max_queries_per_basket: limit on the number of checked queries per basket,
                0 - unlimited
            :param run_bisect: whether to run a binary search for the commit that caused the change
                (started for the first two lost documents of a basket)
            :param current_sandbox_task: current sandbox task
            :return: yields Task objects, baskets without lost documents go last
        """
        tasks = []
        checked_queries = set()
        token = rm_sec.get_rm_token(current_sandbox_task)
        for launch in metrics_json_response['launches']:
            regional_type = launch['regionalType']
            evaluation_type = launch['evaluationType']
            for query_group in launch['diffQueryGroups']:
                reference_host = query_group['baselineServer']['host']
                checked_host = query_group['server']['host']
                reference_serpset_id = query_group['baselineSerpsetId']
                checked_serpset_id = query_group['serpsetId']
                queries_group_name = ' '.join((query_group['name'], query_group['filterName']))
                if reference_serpset_id in BAD_SERPSET_IDS or checked_serpset_id in BAD_SERPSET_IDS:
                    # Name the skipped basket, otherwise it cannot be told apart in the report
                    tasks.append(FindUrl.Task(FindUrl.Task.State.null_serpset_id, name=queries_group_name))
                    continue
                try:
                    indexed_reference_serpset = index_serpset(get_serpset_from_metrics(reference_serpset_id, token))
                    indexed_checked_serpset = index_serpset(get_serpset_from_metrics(checked_serpset_id, token))

                    queries = get_top_queries_from_serpsets_diff(
                        reference_serpset_id, checked_serpset_id, regional_type, evaluation_type, token
                    )
                    # Queries already checked in previous baskets are not checked again
                    queries = [q for q in queries if q.text not in checked_queries]
                    high_diff_queries_amount = len(queries)
                    # `0 or None` turns the limit off
                    queries = queries[:max_queries_per_basket or None]
                    queries = (
                        q.set_diff_url(regional_type, evaluation_type, reference_serpset_id, checked_serpset_id)
                        for q in queries
                    )
                    queries = (q.set_metrics_docs(indexed_reference_serpset, set_reference=True) for q in queries)
                    queries = (q.set_metrics_docs(indexed_checked_serpset, set_reference=False) for q in queries)
                    queries = list(queries)

                    reference_metrics_cgis = get_cgis_from_metrics(reference_serpset_id, token)
                    checked_metrics_cgis = get_cgis_from_metrics(checked_serpset_id, token)

                    # find_missing_documents is a generator that also fills the queries in,
                    # so it has to be exhausted before the queries are used below
                    missing_documents = list(find_missing_documents(
                        reference_metrics_cgis,
                        checked_metrics_cgis,
                        reference_host,
                        checked_host,
                        queries,
                    ))
                    queries_with_lost_docs = len(queries) - [query.lost_documents_amount for query in queries].count(0)

                    for query in queries:
                        query.reference_metrics_request = query.recreate_metrics_request(
                            reference_metrics_cgis, reference_host
                        )
                        query.checked_metrics_request = query.recreate_metrics_request(
                            checked_metrics_cgis, checked_host
                        )
                        for doc in query.checked_docs:
                            doc.set_relevance(indexed_reference_serpset, indexed_checked_serpset)
                        for doc in query.reference_docs:
                            doc.set_relevance(indexed_reference_serpset, indexed_checked_serpset)

                    missing_documents = filterfalse(is_wizard_or_blender_url, missing_documents)
                    missing_documents = [doc for doc in missing_documents if doc.is_relevant]
                    lost_relevant_docs_amount = len(missing_documents)
                    missing_documents = missing_documents[:max_missing_docs_per_basket or None]

                    task = FindUrl.run_batch_task(missing_documents, checked_host, checked_metrics_cgis)
                    if run_bisect:
                        task.bisected = True
                        for doc in missing_documents[:2]:
                            logging.debug("Starting bisect for query %s", doc.query.text)
                            doc.btw = FindUrl.BisectTaskWrapper(
                                current_sandbox_task,
                                checked_host,
                                doc.query.text,
                                doc.query.checked_request,
                                doc.reference_position,
                            )
                            doc.btw.run()

                    task.max_queries_per_basket = max_queries_per_basket
                    task.high_diff_queries_amount = high_diff_queries_amount
                    task.lost_relevant_docs_amount = lost_relevant_docs_amount
                    task.queries_with_lost_docs = queries_with_lost_docs
                    task.max_missing_docs_per_basket = max_missing_docs_per_basket

                    checked_queries.update(doc.query.text for doc in missing_documents)
                    logging.debug(checked_queries)
                except Exception as e:
                    # One broken basket must not break the others: report it as a fatal task
                    eh.log_exception("Caught unexpected exception while gathering data or starting findurl task", e)
                    task = FindUrl.Task(FindUrl.Task.State.fatal.format(utils.get_task_log(current_sandbox_task)))
                task.name = queries_group_name
                task.metrics_url = (
                    'https://metrics.yandex-team.ru/mc/queries?regional={}&evaluation={}&aspect=tcg'
                    '&metric={}&serpset={}&serpset={}&serpset-filter=onlySearchResult'
                    '&serpset-filter=onlySearchResult&page-size=100&sort-field=diff&'
                    'sort-direction=asc'.format(
                        regional_type,
                        evaluation_type,
                        DEFAULT_METRIC[evaluation_type],
                        checked_serpset_id,
                        reference_serpset_id,
                    ))
                tasks.append(task)
        # sorted() is stable and False < True: tasks keep their order, "not needed" ones go last
        for task in sorted(tasks, key=lambda t: t.status == FindUrl.Task.State.not_needed):
            try:
                FindUrl.wait_task(task)
            except Exception as e:
                eh.log_exception("Caught unexpected exception while checking findurl task status", e)
                # Only the status is changed: replacing the task would lose its name, metrics url,
                # documents and the bisect tasks started for them
                task.status = FindUrl.Task.State.fatal.format(utils.get_task_log(current_sandbox_task))
            for doc in task.documents:
                if doc.btw:
                    try:
                        doc.btw_result = doc.btw.get_result()
                    except Exception as e:
                        eh.log_exception("Caught unexpected exception while waiting bisect task completion", e)
                        doc.btw_result = {"Result": "Timeout"}
                    logging.debug("doc.btw_result %s", doc.btw_result)
            yield task

    @staticmethod
    def run_batch_task(documents, hostname, main_cgis, findurl_backend=FINDURL_BACKEND):
        """
            Start a findurl batch task
            :param documents: list of Document objects
            :param hostname: host name, for example 'base.hamster.yandex.ru'
            :param main_cgis: CGIs of the search request that are passed to findurl
            :param findurl_backend: url of findurl API
            :return: Task object: "not needed" when there are no documents, "started" with the
                FindUrl task id on success, "failed to start" when the reply has no usable id
        """
        qurls = []
        for doc in documents:
            qurls.append({
                "query": doc.query.text,
                "url": doc.orig_url,
                "region": str(doc.query.region_id)
            })
        if not qurls:
            return FindUrl.Task(FindUrl.Task.State.not_needed)

        logging.debug("Starting Findurl task")
        token = sdk2.Vault.data('findurl_token')
        auth_headers = {"Authorization": "OAuth {}".format(token)}
        payload = {
            "domain": hostname,
            "cgi": parse.quote(main_cgis + ADDITIONAL_CGIS),
            "description": "Launch metrics launch",
            "service_name": "WEB",
            "qurls": qurls
        }
        initial_request = requests_wrapper.post(
            "{}api/findurl.Findurl/postTask".format(findurl_backend),
            headers=auth_headers,
            json=payload,
        )
        try:
            # reply example: {'id': "73ed1a9a-dbd9-4eac-aaf0-027c89152f06"}
            task_id = initial_request.json()["id"]
        except Exception:
            logging.debug(
                "Failed to start Findurl task, replied with code: '%s', text: '%s'",
                initial_request.status_code,
                initial_request.text
            )
            return FindUrl.Task(FindUrl.Task.State.failed_to_start)

        logging.debug("Started Findurl task: %s", task_id)
        return FindUrl.Task(FindUrl.Task.State.started, task_id, documents)

    @staticmethod
    def wait_task(task, minutes_to_wait=5):
        """
            Try to wait for a FindUrl task to complete.

            The task is polled up to ``minutes_to_wait`` times with a minute between the polls;
            ``task.status`` is updated after every poll. On success the fields returned by FindUrl
            are copied onto the documents of the task. A failed request is retried at once,
            without the pause. Tasks that have no FindUrl id are left untouched.
            :param task: Task object
            :param minutes_to_wait: number of minutes to wait
        """
        if not task.task_id:
            # Nothing was started on the FindUrl side ("not needed", "failed to start", ...):
            # polling with an empty id could only replace the informative status with an error
            logging.debug('Task has no FindUrl id, nothing to wait for. %s', task)
            return

        for attempt in range(minutes_to_wait):
            try:
                logging.debug('get task status, task_id=%s', task.task_id)
                req = requests_wrapper.post(
                    '{}api/findurl.Findurl/getTask'.format(FINDURL_BACKEND), json={'id': task.task_id}
                )
            except Exception as e:
                task.status = FindUrl.Task.State.requests_failed
                eh.log_exception("Request to findurl failed with exception", e)
                continue
            else:
                if req.status_code != 200:
                    task.status = FindUrl.Task.State.requests_failed
                    logging.debug(
                        'Request to findurl failed with %s code, response text:\n%s',
                        req.status_code, req.text
                    )
                    continue
            reply_json = req.json()
            logging.debug('Findurl answered with %s status code, response json:\n%s', req.status_code, reply_json)
            findurl_status = reply_json['task'].get('status', None)
            if findurl_status is None:
                task.status = FindUrl.Task.State.bad_reply
                logging.debug('There is no "status" field in Findurl reply')
            else:
                logging.debug('Findurl task status is %s', findurl_status)
                if findurl_status in ('SUCCESS', 'PARTIAL_SUCCESS'):
                    all_in_top10 = all(qurl['status'] == 'TOP10' for qurl in reply_json['qurls'])
                    if all_in_top10:
                        task.status = FindUrl.Task.State.done_all_in_top10
                    else:
                        task.status = FindUrl.Task.State.done
                    task.populate_docs_from_findurl_json(reply_json)
                    break
                elif findurl_status.endswith('ERROR'):
                    task.status = FindUrl.Task.State.failed
                    break
                elif findurl_status == 'STOPPED':
                    task.status = FindUrl.Task.State.stopped
                    break
                else:
                    # Any other status is treated as "still working"
                    logging.debug("FindUrl task status '%s' wasn't recognized", findurl_status)
                    task.status = FindUrl.Task.State.working
            if attempt + 1 < minutes_to_wait:
                # No point in sleeping after the last poll: nothing is checked afterwards
                time.sleep(60)
        else:
            logging.debug('All retries exhausted. There are some problems or task is taking too long')
        # Example of a successful reply from findurl kept from the original code.
        # NOTE(review): it does not have the 'task' / 'qurls' keys read above, so it looks like
        # a reply of an older API - confirm and update.
        # {
        #     'progress': 100,
        #     'result': [
        #         {
        #             'info':
        #                 'PASSED,SEARCH,MERGE:0.0,RANKING_MERGE:0.0,REARR_AFTER_MERGE:0.0,'
        #                 'AFTER_FETCH:0.0,REARR_AFTER_FETCH:0.0',
        #             'lr': '213',
        #             'mark': '',
        #             'orig_query': 'vk',
        #             'orig_url': 'vk.com',
        #             'pos': '1',
        #             'query': '',
        #             'serp_pos': '1',
        #             'status': 'TOP10',
        #             'substatus': '',
        #             'url': 'vk.com'
        #         },
        #         ... one entry like this per query/url pair ...
        #     ],
        #     'status': 'DONE',
        #     'taskid': 'any-20170428_191835',
        #     'taskurl': 'https://findurl.n.yandex-team.ru/?taskid=any-20170428_191835'
        # }
        logging.debug(task)


def find_missing_documents(
        reference_metrics_cgis,
        checked_metrics_cgis,
        reference_host_name,
        checked_host_name,
        queries,
        unanswer_threshold=100
):
    """
        Find documents that are present in the output of reference_host_name but are absent
        from the output of checked_host_name.

        This is a generator. As a side effect every processed query gets the host names, the
        request ids, the documents and the requests of both betas, and its
        ``lost_documents_amount`` counts the documents produced for it.
        :param reference_metrics_cgis: CGIs of the reference request from metrics
        :param checked_metrics_cgis: CGIs of the checked request from metrics
        :param reference_host_name: name of the reference host
        :param checked_host_name: name of the checked host
        :param queries: list of Query objects
        :param unanswer_threshold: number of unanswers that stops the comparison with an error
        :return: yields Document objects
    """
    failures = 0
    for number, query in enumerate(queries, 1):
        query.reference_host_name = reference_host_name
        query.checked_host_name = checked_host_name
        # The limit is checked before the next query is sent
        if failures >= unanswer_threshold:
            raise Exception("Had more than {} failures, stopping comparison".format(unanswer_threshold))

        logging.debug(
            'Request #%s: text "%s", region "%s", metric diff "%s"',
            number, query.text, query.region_id, query.diff_value,
        )

        query.reference_req_id, query.reference_docs, query.reference_request = make_requests(
            reference_metrics_cgis, reference_host_name, query
        )
        query.checked_req_id, query.checked_docs, query.checked_request = make_requests(
            checked_metrics_cgis, checked_host_name, query
        )

        reference_answered = bool(query.reference_docs)
        checked_answered = bool(query.checked_docs)
        if reference_answered and checked_answered:
            for doc in compare_search_results(query.reference_docs, query.checked_docs):
                query.lost_documents_amount += 1
                yield doc
            continue

        # At least one beta returned no documents: nothing to compare, count the failure
        if reference_answered:
            problem = 'checked beta unanswer'
        elif checked_answered:
            problem = 'sample beta unanswer'
        else:
            problem = 'both betas unanswer'
        diff_entry = {
            'query': query.text,
            'region': query.region_id,
            'reference_req_id': query.reference_req_id,
            'reference_docs': query.reference_docs,
            'checked_req_id': query.checked_req_id,
            'checked_docs': query.checked_docs,
            'problem': problem,
        }
        logging.debug('Problem detected: %s, diff entry\n%s', problem, diff_entry)
        failures += 1


def is_wizard_or_blender_url(doc):
    """
        Check whether the document url contains a known wizard or blender marker.
        On a match the document is tagged: doc.type becomes 'wizard' or 'blender'.
        :param doc: object exposing orig_url; its type attribute is assigned on a match
        :return: True if any marker substring occurs in doc.orig_url, False otherwise
    """
    markers_by_kind = {
        'wizard': ('newswizard', 'imageswizard'),
        'blender': ('videoblend', 'geo_wizard'),
    }
    for kind in markers_by_kind:
        if any(marker in doc.orig_url for marker in markers_by_kind[kind]):
            logging.debug('This looks like wizard or blender url: ' + doc.orig_url)
            doc.type = kind
            return True
    return False


def gen_headers_from_token(token):
    """
        Build HTTP headers for a JSON request authorized with an OAuth token.
        :param token: oauth token, inserted into the Authorization header
        :return: dict with Content-Type and Authorization headers
    """
    headers = {"Content-Type": "application/json;charset=UTF-8"}
    headers["Authorization"] = "OAuth {}".format(token)
    return headers


def get_top_queries_from_serpsets_diff(
    reference_serpset_id,
    checked_serpset_id,
    regional_type,
    evaluation_type,
    token,
    threshold=DEFAULT_DIFF_THRESHOLD,
    metric_name='',
    positive=False,
):
    """
        Yield the queries with the largest metric degradation between two serpsets.
        The queries are requested from metrics sorted by diff; iteration stops at the
        first query whose diff is beyond the threshold. If the request or the decoding
        of its response fails, the exception is logged and nothing is yielded.
        :param reference_serpset_id: id of the reference serpset in metrics
        :param checked_serpset_id: id of the checked serpset in metrics
        :param regional_type: region, e.g. 'RU'
        :param evaluation_type: evaluation type, e.g. 'WEB' or 'MOBILE'
        :param token: oauth token used to access metrics
        :param threshold: queries are taken while diff does not exceed threshold
            (the sign of threshold is flipped when positive is set)
        :param metric_name: name of the metric used for sorting;
            DEFAULT_METRIC[evaluation_type] is used when it is empty
        :param positive: take queries with the largest metric improvement instead of degradation
        :return: generator of FindUrl.Query objects
    """
    sort_direction = 'desc' if positive else 'asc'
    if positive:
        threshold = threshold * -1

    try:
        url_template = (
            "https://metrics-join.qe.yandex-team.ru/api/qex/metric-by-queries?regional={}&evaluation={}&" +
            "metric={}&left-serp-set={}&right-serp-set={}&sort-field=diff&sort-direction={}"
        )
        metric = metric_name or DEFAULT_METRIC[evaluation_type]
        url = url_template.format(
            regional_type,
            evaluation_type,
            metric,
            checked_serpset_id,
            reference_serpset_id,
            sort_direction,
        )
        logging.debug("Get top queries from serpsets diff by url: %s", url)

        response = requests_wrapper.get_r(
            url,
            timeout=60,
            headers=gen_headers_from_token(token),
        )
        serpsets_diff_json = response.json()
    except Exception as e:
        eh.log_exception("Failed to get top queries from metrics", e)
        return

    for query_json in serpsets_diff_json['calculatedQueries']:
        diff = query_json['diff']
        # The list is sorted by diff, so the first value beyond the threshold ends the scan
        if positive:
            beyond_threshold = diff < threshold
        else:
            beyond_threshold = diff > threshold
        if beyond_threshold:
            break
        yield FindUrl.Query(query_json)


@decorators.retries(max_tries=3, delay=5, backoff=3)
def get_serpset_from_metrics(serpset_id, token):
    """
        Download a serpset from metrics and decode it.
        The response body is unpacked as gzip and then parsed as JSON.
        The call is wrapped in decorators.retries(max_tries=3, delay=5, backoff=3).
        :param serpset_id: id of the serpset in metrics
        :param token: oauth token used to access metrics
        :return: parsed JSON content of the serpset
    """
    # Function-scope import keeps this block self-contained.
    # response.content is bytes: six.moves.cStringIO is io.StringIO on Python 3 and rejects bytes,
    # while io.BytesIO accepts them on both Python 2 and Python 3.
    import io

    logging.debug('Getting serpset with id %s', serpset_id)
    gzipped = requests_wrapper.get(
        "https://metrics-calculation.qloud.yandex-team.ru/api/json/" + str(serpset_id),
        timeout=120,
        headers=gen_headers_from_token(token),
    )
    # Only the head of the raw (still compressed) body is logged
    logging.debug(gzipped.content[:200])
    # The context manager closes the GzipFile once the JSON has been parsed
    with gzip.GzipFile(fileobj=io.BytesIO(gzipped.content)) as unpacked:
        return json.load(unpacked)


def get_cgis_from_metrics(serpset_id, token, remove_list=None):
    """
        Extract the cgi parameters that were used to collect a serpset.
        The SERP headers of the serpset are requested from metrics, the response body is
        unpacked as gzip and parsed as JSON, and the query string of the url stored in the
        first serp is returned without the parameters listed in remove_list.
        :param serpset_id: id of the serpset in metrics
        :param token: oauth token used to access metrics
        :param remove_list: names of cgi parameters to drop; ['lr', 'text'] when None
        :return: url-encoded string of the remaining cgi parameters
    """
    # Function-scope import keeps this block self-contained.
    # resp.content is bytes: six.moves.cStringIO is io.StringIO on Python 3 and rejects bytes,
    # while io.BytesIO accepts them on both Python 2 and Python 3.
    import io

    logging.debug('Getting cgis from serpset with id %s', serpset_id)
    if remove_list is None:
        remove_list = ['lr', 'text']
    resp = requests_wrapper.get_r(
        'https://metrics.yandex-team.ru/api-mc/json/{}?strict=true&requirement=SERP.headers'.format(serpset_id),
        # NOTE(review): certificate verification is disabled while an OAuth token is sent - confirm this is intended
        verify=False,
        timeout=60,
        headers=gen_headers_from_token(token),
    )
    # The context manager closes the GzipFile once the JSON has been parsed
    with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as unpacked:
        serps = json.load(unpacked)
    # Only the first serp is inspected; an empty serpset raises IndexError here
    cgis = parse.urlparse(serps[0]['headers']['url']).query
    cgis_dict = parse.parse_qs(str(cgis))
    for name in remove_list:
        cgis_dict.pop(name, None)
    # doseq=True writes every value of a repeated parameter as a separate name=value pair
    return parse.urlencode(cgis_dict, doseq=True)


def index_serpset(raw_serpset):
    """
        Index a raw serpset by query text.
        Serps without components are skipped. If several serps share a query text,
        the last one wins.
        :param raw_serpset: iterable of serp dicts with 'query' and 'components' fields
        :return: dict mapping query text to a dict with
            'urls_to_relevances' (page url -> relevance name or None) and
            'urls_list' (page urls in the order of components)
    """
    indexed = {}
    for serp in raw_serpset:
        components = serp.get('components')
        if not components:
            logging.debug("There are no 'components' field in serp!")
            continue

        page_urls = []
        relevance_by_url = {}
        for component in components:
            judgement = component.get('judgements.RELEVANCE')
            relevance = judgement['name'] if judgement else None
            page_url = component['componentUrl']['pageUrl']
            # A repeated url keeps the relevance of its last occurrence
            relevance_by_url[page_url] = relevance
            page_urls.append(page_url)

        indexed[serp['query']['text']] = {
            'urls_to_relevances': relevance_by_url,
            'urls_list': page_urls,
        }
    return indexed


def make_requests(metrics_cgis, host_name, query):
    """
        Request search results for a query from the given host.
        Any failure while requesting or wrapping the results is logged and
        turned into an empty answer.
        :param metrics_cgis: cgi parameters passed to query.recreate_metrics_request
        :param host_name: host passed to query.recreate_metrics_request
        :param query: query object; must provide recreate_metrics_request
        :return: tuple (request id, list of FindUrl.Document, requested url),
            or (None, [], '') on failure
    """
    request_url = query.recreate_metrics_request(metrics_cgis, host_name) + ADDITIONAL_CGIS
    try:
        req_id, doc_urls, constructed_url = get_json_response(request_url, dump_failed=False)
        documents = []
        for doc_url in doc_urls:
            documents.append(FindUrl.Document(query, doc_url))
    except Exception as e:
        eh.log_exception("Request failed", e)
        return None, [], ''
    return req_id, documents, constructed_url


def get_json_response(iter_url, dump_failed=True):
    """
        Fetch a JSON search response and extract the request id and document urls.
        :param iter_url: url to request
        :param dump_failed: log the whole JSON response when the request id
            or the document list is empty
        :return: tuple (request id, list of document urls, url of the sent request)
    """
    response = requests_wrapper.get_r(iter_url)
    final_url = response.request.url
    payload = response.json()

    tmpl_data = payload['tmpl_data']
    reqid = tmpl_data['reqdata']['reqid']
    doc_urls = []
    for doc in tmpl_data['searchdata']['docs']:
        doc_urls.append(doc['url'])

    if dump_failed and not (reqid and doc_urls):
        logging.debug(
            'JSON response:\n%s\n--------------------------------\n',
            json.dumps(payload, indent=2),
        )

    logging.debug('JSON reqid: %s, JSON docs: %s', reqid, doc_urls)
    return reqid, doc_urls, final_url


def compare_search_results(reference_docs, checked_docs):
    """
        Yield the reference documents whose url is absent from the checked serp.
        Each yielded document gets reference_position set to its 1-based position
        in reference_docs. Nothing is yielded when the serps are equal.
        :param reference_docs: list of documents of the reference serp
        :param checked_docs: list of documents of the checked serp
        :return: generator of lost documents
    """
    if reference_docs == checked_docs:
        logging.debug('Serps are exactly the same')
        return

    checked_urls = {doc.orig_url for doc in checked_docs}
    reference_urls = {doc.orig_url for doc in reference_docs}
    if checked_urls != reference_urls:
        logging.debug('Some documents was lost, some was added')
    else:
        logging.debug('Serps contain the same documents, but reshuffled')

    position = 0
    for doc in reference_docs:
        position += 1
        if doc.orig_url in checked_urls:
            continue
        doc.reference_position = position
        yield doc


def get_serp_colors_diff(old, new):
    """
        Pair up documents of two serps by assigning the same color_id to matching documents.
        Walks over the old serp: a document found in the new serp gets a fresh color_id
        that is also written to its counterpart in new; for a document that changed
        position, position_change is set on both sides (i - j on the old document,
        j - i on the new one, where i and j are the 0-based old and new positions).
        Documents missing from the new serp are only logged.
        :param old: list of documents of the old serp
        :param new: list of documents of the new serp
    """
    new_set = set(doc.orig_url for doc in new)
    # NOTE(review): numbering starts at 1; presumably id 0 is left for unmatched documents - confirm
    color = 1
    for i, doc in enumerate(old):
        # Same position in both serps; relies on the equality defined by the document class
        if len(new) > i and doc == new[i]:
            logging.debug("#%s is same as (%s)", i + 1, doc)
            doc.color_id = color
            new[i].color_id = color
            color += 1
        else:
            if doc.orig_url in new_set:
                # NOTE(review): membership is tested by orig_url, but index() uses document equality;
                # index() raises ValueError if the two notions disagree - confirm they match
                j = new.index(doc)
                logging.debug("#%s went to %s position (%s)", i + 1, j + 1, doc)
                doc.color_id = color
                doc.position_change = i - j
                new[j].color_id = color
                new[j].position_change = j - i
                color += 1
            else:
                logging.debug("#%s is gone (%s)", i + 1, doc)
