# -*- coding: utf-8 -*-

import re
import six
import copy
import time
import datetime
import xml.dom.minidom
import json
import traceback
import logging

import sandbox.projects.websearch.middlesearch.resources as ms_resources
from sandbox.projects import resource_types
from sandbox.projects.common import file_utils as fu
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.common import utils
from sandbox.projects.common import utils2
from sandbox.projects.common.search import metadebug
from sandbox.projects.common.search import queries as sq
from sandbox.projects.common.search.components import DefaultMiddlesearchParams as dmp
from sandbox.projects.common.search.response import patterns as rp
from sandbox.projects.common.search.response import cgi as response_cgi

from . import threadPool
from . import basesearch_response_parser as brp
from . import node_types
from .tree import htmldiff

from sandbox.common import errors as common_errors
from sandbox.sandboxsdk import paths
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk import parameters as sp


# Values of the `group` attribute of the task parameters declared below
_GROUP = 'Params for info requests testing'
_DEBUG = 'Debug'

# Sentinel returned instead of a parsed response when a known problem must be skipped rather than reported
_IGNORE = "__IGNORE__"


class QueriesParameter(sp.ResourceSelector):
    """
        Resource with source queries; `_get_queries` syncs it and reads it line by line.
    """
    name = 'queries_resource_id'
    description = 'Source queries'
    group = _GROUP
    resource_type = [
        resource_types.PLAIN_TEXT_QUERIES,
        ms_resources.WebMiddlesearchPlainTextQueries,
        resource_types.IMAGES_MIDDLESEARCH_PLAIN_TEXT_REQUESTS,
        resource_types.VIDEO_MIDDLESEARCH_PLAIN_TEXT_REQUESTS,
    ]
    required = True


class MaxQueries(sp.SandboxIntegerParameter):
    """
        Number of leading queries tested with the main request type(s); 0 means all queries.
    """
    name = 'max_queries'
    description = 'Max queries'
    group = _GROUP
    default_value = 100


class GetRfForRemainingQueries(sp.SandboxBoolParameter):
    """
        Whether queries beyond the first `max_queries` are still tested, with the "rf" info request only.
    """
    name = 'get_rf_for_remaining_queries'
    description = 'Get rf for remaining queries'
    group = _GROUP
    default_value = True


class MaxRfQueries(sp.SandboxIntegerParameter):
    """
        Upper bound on the number of rf-only queries; 0 means no limit.
    """
    name = 'max_rf_queries'
    description = 'Max rf queries'
    group = _GROUP
    default_value = 0


class RequestType(sp.SandboxStringParameter):
    """
        Single info request type to test; when empty, every type from `_INFO_REQUESTS_TYPES` is tested.
    """
    name = 'request_type'
    description = 'Request type'
    group = _GROUP
    required = False


class SaveSearchResponses(sp.SandboxBoolParameter):
    """
        Whether the search response is attached to failed results and written to the errors report.
    """
    name = 'save_search_responses'
    description = 'Save search responses in case of errors'
    group = _DEBUG
    default_value = False


class SavePatchedQueries(sp.SandboxBoolParameter):
    """
        Whether the patched search URLs and the found doc ids are written to the debug dump file.
    """
    name = 'save_patched_queries'
    description = 'Save patched queries'
    group = _DEBUG
    default_value = False


class DisableCache(sp.SandboxBoolParameter):
    """
        Whether "&nocache=da" is appended to every search URL.
    """
    name = 'disable_cache'
    description = 'Disable cache (see SEARCH-1304)'
    group = _DEBUG
    default_value = False


# All parameters of the info requests test: the ones declared in this module
# followed by the parameters of metadebug and threadPool
PARAMS = (
    (
        QueriesParameter,
        MaxQueries,
        GetRfForRemainingQueries,
        MaxRfQueries,
        RequestType,
        # debug options
        SaveSearchResponses,
        SavePatchedQueries,
        DisableCache,
    ) +
    metadebug.PARAMS.params +
    threadPool.PARAMS
)


def test_info_requests(search_component, ignore_errors_for_types=None):
    """
        Start the search component, run the info requests test against it and stop it.

        :param search_component: component object providing start/wait/warmup_request/stop
        :param ignore_errors_for_types: optional collection of request types
            whose errors must not fail the test
    """
    search_component.start()
    try:
        search_component.wait()
        search_component.warmup_request()  # Defeat timeout problems, like in SEARCH-1441
        _test_info_requests_impl(search_component, ignore_errors_for_types)
    finally:
        # Stop the component even when the test fails, so it is not left running
        search_component.stop()


def _test_info_requests_impl(search_component, ignore_errors_for_types):
    """
        Run info requests for the configured queries against a started component and check the results.

        The main queries are processed with the request type taken from the task context,
        the remaining (rf-only) queries are processed with the "rf" request type.

        :param search_component: started component; its `port` is used to build the base url
        :param ignore_errors_for_types: request types whose errors must not fail the test
    """
    base_url = "http://localhost:{}/yandsearch".format(search_component.port)

    ctx = channel.task.ctx
    queries, only_rf_queries = _get_queries(ctx)

    start_time = time.time()

    common_params = Params(
        base_url=base_url,
        request_type=None,
        save_search_responses=utils.get_or_default(ctx, SaveSearchResponses),
        save_patched_queries=utils.get_or_default(ctx, SavePatchedQueries),
        dump_subsource_answer=metadebug.dump_subsource_answer(ctx),
        disable_cache=utils.get_or_default(ctx, DisableCache),
        is_int_component=utils.get_or_default(ctx, dmp.UseInt),
    )

    def run(list_to_process, request_type):
        # every run gets its own copy of the settings with the request type filled in
        params = copy.deepcopy(common_params)
        params.request_type = request_type
        return search_component.use_component(
            lambda: threadPool.process_data(
                func=_thread_func,
                list_to_process=list_to_process,
                params=params,
                ctx=ctx,
            )
        )

    results = run(queries, ctx[RequestType.name])
    logging.info("processing time: %s", str(datetime.timedelta(seconds=(time.time() - start_time))))

    if only_rf_queries:
        logging.info("starting threads for only_rf_queries")
        start_time = time.time()
        only_rf_results = run(only_rf_queries, "rf")
        results.extend(only_rf_results)
        logging.info("processing time: %s", str(datetime.timedelta(seconds=(time.time() - start_time))))

    # Reuse the value already resolved with its default: a direct ctx lookup here
    # could raise KeyError only after all the requests have been processed
    result_stat = _process_results(results, common_params.save_search_responses)

    _check_results(result_stat, ignore_errors_for_types)


def _get_queries(ctx):
    """
        Read the queries resource and split it into two lists.

        :return: (queries for the main run, queries for the rf-only run)
    """
    all_queries = fu.read_lines(channel.task.sync_resource(ctx[QueriesParameter.name]))

    limit = utils.get_or_default(ctx, MaxQueries)
    if limit == 0:
        # no limit: everything goes to the main run, nothing is left for rf
        head, rf_tail = all_queries, []
    else:
        head = all_queries[:limit]
        rf_tail = all_queries[limit:] if utils.get_or_default(ctx, GetRfForRemainingQueries) else []

    rf_limit = utils.get_or_default(ctx, MaxRfQueries)
    if rf_limit != 0:
        rf_tail = rf_tail[:rf_limit]

    return head, rf_tail


def _check_results(stat, ignore_errors_for_types):
    errors = []

    for request_type, s in stat.iteritems():
        if s['error']:
            if ignore_errors_for_types and request_type in ignore_errors_for_types:
                continue
            errors.append(request_type)

    if errors:
        eh.check_failed(
            'Errors with info-requests: {}\n'
            'See INFO_REQUESTS_TEST_LOGS resource for more info'.format(','.join(errors))
        )


def _process_results(results, save_search_responses, write_logs=True):
    # get common statistics
    stat = {}
    errors = False

    for result in results:
        if result['request_type'] not in stat:
            stat[result['request_type']] = {'ok': 0, 'error': 0}

        if result['passed']:
            stat[result['request_type']]['ok'] += 1
        else:
            stat[result['request_type']]['error'] += 1
            errors = True

    if write_logs:
        # write results to the log file
        for request_type, value in stat.items():
            stat_string = 'Type: {} Ok: {} Error: {}'.format(request_type, value['ok'], value['error'])
            logging.info(stat_string)

        if errors:
            _write_results_to_file(results, save_search_responses)

    return stat


def _write_results_to_file(results, save_search_responses):
    """
        Write the failed results to an html report resource and one failed query to a sample resource.

        :param results: list of result dicts produced by `_send_info_request_and_check_relevance`
        :param save_search_responses: when true, the saved search responses are included into the report
    """
    task = channel.task
    errors_resource = task.create_resource(
        'Info requests errors',
        'info_requests_errors.html',
        resource_types.INFO_REQUESTS_TEST_LOGS,
    )

    # save one of bad request to recheck
    bad_requests_resource = channel.task.create_resource(
        'One bad request sample',
        'one_bad_request.txt',
        resource_types.PLAIN_TEXT_QUERIES,
    )

    # `with` guarantees that both files are closed even if writing the report fails
    with open(errors_resource.path, "w") as errors_file:
        with open(bad_requests_resource.path, "w") as bad_requests_file:

            def write_body():
                bad_query_example = False
                for result in results:
                    if result['passed']:
                        # skip good results
                        continue

                    if not bad_query_example:
                        # write only one query to test
                        original_query = re.sub(r'&info=.*', '', result['info_request'])
                        bad_requests_file.write(original_query + '\n')
                        bad_query_example = True

                    errors_file.write(htmldiff.StartBlock("request type: " + result['request_type']))

                    htmldiff.WriteDataBlock(errors_file, "request", result['info_request'])
                    htmldiff.WriteDataBlock(errors_file, "docid", result['doc'].DocId)
                    htmldiff.WriteDataBlock(errors_file, "error", result['error'])
                    htmldiff.WriteDataBlock(errors_file, "time", str(result['time']))

                    if 'factors' in result:
                        search_factors, info_factors = result['factors']
                        if search_factors != info_factors:
                            errors_file.write(
                                htmldiff.StartBlock(
                                    "changed factors (search vs info request)", htmldiff.COLOR_CHANGED
                                )
                            )
                            node_types.write_objects_diff(
                                errors_file, None, ["factor"], search_factors, info_factors, None
                            )
                            errors_file.write(htmldiff.EndBlock())

                    if 'info_response' in result:
                        htmldiff.WriteDataBlock(errors_file, "info response", result['info_response'])

                    if 'search_response' in result and save_search_responses:
                        htmldiff.WriteDataBlock(errors_file, "search response", result['search_response'])

                    errors_file.write(htmldiff.EndBlock())

            htmldiff.WriteDiff(errors_file, write_body, addLegend=False, addButtons=False, title="errors")

    # the files are closed at this point, so the resources are complete
    task.mark_resource_ready(errors_resource)
    task.mark_resource_ready(bad_requests_resource)
    task.set_info("See " + utils2.resource_redirect_link(errors_resource.id, "info requests errors"), do_escape=False)
    task.set_info(
        "Sample query: " + utils2.resource_redirect_link(bad_requests_resource.id, "bad query"),
        do_escape=False,
    )


class DummyChecker:
    """
        Default checker: parses the response as JSON and accepts any content.
    """

    @staticmethod
    def parse(info_response):
        """Decode the raw info response as JSON."""
        parsed = json.loads(info_response)
        return parsed

    @staticmethod
    def check(response):
        """Accept any parsed response."""
        return None


class CGISepChecker(DummyChecker):
    """
        Checker for "cgisep" answers: expects a non-empty CGISepParams table.
    """

    @staticmethod
    def check(response):
        eh.ensure("CGISepParams" in response, "Cannot find element CGISepParams in response")

        params_node = response["CGISepParams"]
        is_table = params_node.get("type") == "table"
        eh.ensure(is_table, "CGISepParams type is not table")

        eh.ensure(params_node.get("rows"), "CGISepParams rows is empty")


class CGIParamsChecker(DummyChecker):
    """
        Checker for "cgiparams" answers: expects a CGIParams element that contains "data".
    """

    @staticmethod
    def check(response):
        has_section = "CGIParams" in response
        eh.ensure(has_section, "Cannot find element CGIParams in response")

        section = response["CGIParams"]
        eh.ensure("data" in section, "No data in CGIParams")


class DocattrChecker:
    """
        Checker for "docattr" answers, which are parsed as XML.
    """

    # tags that must be found one inside another, from the outermost to the innermost
    path = ["yandexsearch", "response", "results", "grouping", "group", "doc", "categ"]

    @staticmethod
    def parse(info_response):
        document = xml.dom.minidom.parseString(info_response)
        return document

    @staticmethod
    def check(response):
        # docattr info response can return only xml
        node = response
        for tag in DocattrChecker.path:
            found = node.getElementsByTagName(tag)
            eh.ensure(found, "Cannot find element '{}' in response".format(tag))
            node = found[0]


class IntComponentChecker:
    """
        Special class for Int component to check response for known errors.
    """

    # error messages that do not fail the check
    _KNOWN_ERRORS = (
        "(yexception) not found rank model",
        "(yexception) not found factors storage",
    )

    @staticmethod
    def parse(info_response):
        """Return the raw response as is: it is checked as plain text."""
        return info_response

    @staticmethod
    def check(response):
        """
            Fail unless the response has no error or only one of the known errors.

            :param response: raw response text
        """
        if "ERROR" not in response:
            return

        parts = response.split("ERROR: ")
        # "ERROR" may be present without the "ERROR: " prefix; then there is no
        # message part to extract, so the whole response is treated as the message
        message = parts[1] if len(parts) > 1 else response

        if message not in IntComponentChecker._KNOWN_ERRORS:
            logging.debug("ERROR: %s", message)
            eh.check_failed("Unknown error!")


# Request types that are checked with IntComponentChecker instead of their
# regular checker when the tested component is Int (see _get_checker_by_req_type)
_REQUEST_TYPES_INT_COMP_TO_SKIP_ = {
    "mnvariance",
    "rankingmodels",
    "matrixnet",
    "rf",
    "matrixnetfactors",
    "fm",
}


def _get_checker_by_req_type(req_type, is_int_component):
    """
        Choose the checker class for the given info request type.

        :param req_type: info request type
        :param is_int_component: whether the tested component is Int
        :return: checker class with `parse` and `check` static methods
    """
    if is_int_component and req_type in _REQUEST_TYPES_INT_COMP_TO_SKIP_:
        return IntComponentChecker

    dedicated_checkers = (
        ("docattr", DocattrChecker),
        ("cgisep", CGISepChecker),
        ("cgiparams", CGIParamsChecker),
    )
    for type_name, checker_class in dedicated_checkers:
        if req_type == type_name:
            return checker_class

    # every other type only has to be valid JSON
    return DummyChecker


# Info request types sent for every document when no single request type
# is specified in the thread params (see _thread_func)
_INFO_REQUESTS_TYPES = [
    "rf",
    "frf",
    "matrixnet",
    "matrixnetfactors",
    "fm",
    "erfinfo",
    "herfinfo",
    "textarchive",
    "linkarchive",
    "docattr",
    "docid",
    "topandargs",
    "triterator",
    "quorumparams",
    "trpos",
    "lrpos",
    "annpos",
    "trrelevance",
    "lrrelevance",
    "snippets",
    "snippethits",
    "stamp",
    "version",
    "config",
    "cgiparams",
    "mnvariance",
    "cgisep",
    "rankingmodels",
    "annotations",
    "panthermatch",
    "pantherterms",
]


# Maximum allowed response length per request type (compared with len() of the response).
# Types that are not listed here are limited to 1024 * 1024; 0 disables the limit.
_INFO_REQUESTS_LENGHTS = {
    "quorumparams": 2 * 1024 * 1024,
    "annotations": 5 * 1024 * 1024,
    # some documents have extremely large text archives
    "textarchive": 0,  # all sizes allowed
    "docattr": 3 * 1024 * 1024,
    "rankingmodels": 3 * 1024 * 1024,
}

# NOTE(review): not referenced by the code visible in this file; presumably the
# request types whose answers involve relevance calculation - confirm before use
_RELEVANCE_REQUEST_TYPES = [
    "rf",
    "fm",
    "topandargs",
    "triterator",
    "quorumparams",
    "trpos",
    "lrpos",
    "annpos",
    "trrelevance",
    "lrrelevance",
]

# Request types whose relevance and factors are compared with the values from
# the search response (the comparison is skipped for Int component)
_META_RELEVANCE_REQUEST_TYPES = [
    "rf",
    "fm",
]


def _get_relevance_from_response(response):
    metadata = response.get('MetaData', None)
    if not metadata:
        eh.check_failed("MetaData not found in info request answer")

    for row in metadata['rows']:
        cells = row["cells"]
        if cells[0] == 'Relevance':
            return cells[1]

    eh.check_failed("relevance not found in info request answer metadata")


def _get_factors_from_info_response(response):
    """
        Extract the factors from the MetaData table of a parsed info response.

        :param response: parsed (JSON) info response
        :return: list of beautified factor values, or None when there is no 'Factors' row
    """
    meta = response.get('MetaData', None)
    eh.ensure(meta, "MetaData not found in info request answer")
    eh.ensure('rows' in meta, "MetaData doesn't contain 'rows'")

    for entry in meta['rows']:
        eh.ensure('cells' in entry, "MetaData['row'][...] doesn't contain cells")
        row_cells = entry["cells"]
        cell_count = len(row_cells)
        eh.ensure(cell_count > 0, "MetaData['row'][...]['cells'][...]'s len is zero")

        if row_cells[0] != 'Factors':
            continue

        eh.ensure(cell_count > 1, "MetaData['row'][...]['cells'][...] doesn't contain a value for Factors")
        # the factors are space-separated in the second cell
        return [rp.beautify_factor_value(raw) for raw in row_cells[1].split(" ")]

    return None


def _get_info_request_url(search_url, request_type, doc_id):
    return "{}&info={}:docid:{};json".format(search_url, request_type, doc_id)


def _fetch_info_response(info_request_url):
    """
        Fetch the info response, retrying while the answer is empty.

        :param info_request_url: full url of the info request
        :return: non-empty response, or None when every attempt returned an empty answer
    """
    current_timeout = 300
    fetched = None

    for attempt_number in range(brp.MAX_TIMEOUT_ATTEMPTS):
        try:
            fetched = brp.fetch_data(
                info_request_url,
                timeout=current_timeout,
                attempt=attempt_number,
                ignore_empty_response=True,  # SEARCH-2791
            )
        except Exception as exception:
            eh.check_failed(
                "Exception type: {}\n{}".format(type(exception), traceback.format_exc())
            )
        else:
            if not fetched:
                logging.info(
                    "Got empty response for %s request with %s timeout, retrying [%s]...",
                    info_request_url, current_timeout, attempt_number,
                )
                # the retries use a shorter timeout than the first attempt
                current_timeout = 60
                continue

        return fetched

    logging.error("Giving up for %s request after %s attempts", info_request_url, brp.MAX_TIMEOUT_ATTEMPTS)
    return None


def _send_info_request_and_parse_response(request_type, info_request_url, result_dict, is_int_component):
    """
        Fetch the info response, validate its size and parse it with the checker of the request type.

        Problems are reported through `eh.check_failed`; in most failure cases the raw
        response is stored into result_dict['info_response'] first.

        :param request_type: info request type
        :param info_request_url: full url of the info request
        :param result_dict: result dict of the current request, updated in place on failures
        :param is_int_component: whether the tested component is Int
        :return: (raw response, parsed response); the second item is `_IGNORE`
            when the problem must be skipped
    """
    try:
        response = _fetch_info_response(info_request_url)
    except Exception:
        if is_int_component and request_type == "rankingmodels":
            return "", _IGNORE
        # bare raise keeps the original traceback (`raise exc` restarts it from here on Python 2)
        raise
    parsed_response = None

    required_max = _INFO_REQUESTS_LENGHTS.get(request_type, 1024 * 1024)
    if response is None:
        eh.check_failed("Response for info request of type '{}' is empty".format(request_type))

    # required_max == 0 means that any size is allowed
    if required_max and len(response) > required_max:
        result_dict['info_response'] = response
        eh.check_failed("Answer for info request of type '{}' is too long: {} bytes while allowed {} bytes".format(
            request_type, len(response), required_max,
        ))

    if response.startswith('error:'):
        result_dict['info_response'] = response
        eh.check_failed('Cannot process info request')

    try:
        parsed_response = _get_checker_by_req_type(request_type, is_int_component).parse(response)
    except Exception as exception:
        result_dict['info_response'] = response

        if 'Z0000000000000000' in response:
            logging.info("Z000 docId detected")
            # NOTE(review): this time-limited workaround expired on 2017-09-24,
            # so the condition below is expected to be always false now
            if datetime.date.today() < datetime.date(2017, 9, 24):
                return response, _IGNORE

        eh.check_failed(
            "Cannot parse error response for info request:\n{}\n'{} (exception type: {})'".format(
                info_request_url, exception, type(exception),
            )
        )

    return response, parsed_response


def _send_info_request_and_check_relevance(search_url, search_response, request_type, doc,
                                           save_search_responses, is_int_component):
    """
        Send one info request for the document and validate the answer.

        For the request types from `_META_RELEVANCE_REQUEST_TYPES` (and not for Int component)
        the relevance and the factors of the answer are compared with the ones from the search response.

        :param search_url: search url the info request is built upon
        :param search_response: raw search response, attached to the result on failure if requested
        :param request_type: info request type
        :param doc: document from the search response (DocId, Factors and Relevance are used)
        :param save_search_responses: whether to attach search_response to failed results
        :param is_int_component: whether the tested component is Int
        :return: result dict with 'info_request', 'doc', 'request_type', 'passed', 'time'
            and, for failures, 'error' plus optional diagnostic fields
    """
    info_request_url = _get_info_request_url(search_url, request_type, doc.DocId)

    result = {
        'info_request': info_request_url,
        'doc': doc,
        'request_type': request_type,
    }

    checker = _get_checker_by_req_type(request_type, is_int_component)

    start = time.time()

    try:
        info_response, parsed_response = _send_info_request_and_parse_response(
            request_type, info_request_url, result, is_int_component,
        )

        if isinstance(parsed_response, str) and parsed_response == _IGNORE:
            # Spike for ignoring specific bugs
            result['time'] = time.time() - start
            result['passed'] = True
            result['ignored'] = True
            return result
        try:
            checker.check(parsed_response)
        except common_errors.TaskFailure as exc:
            # The message is passed as an argument: being a part of the format string
            # it would break the log record formatting if it contained '%'
            logging.info(
                "%s where response_info is %s and doc_id is %s",
                exc.message, info_response, doc.DocId,
            )
            eh.check_failed(exc.message)

        if request_type in _META_RELEVANCE_REQUEST_TYPES and not is_int_component:
            info_relevance = _get_relevance_from_response(parsed_response)
            search_factors = doc.Factors.split(" ") if doc.Factors else []
            info_factors = _get_factors_from_info_response(parsed_response)

            if len(search_factors) > 1:
                if not info_factors:
                    result['info_response'] = info_response
                    eh.check_failed("Factors not found in info request answer")
                # compare only the common prefix of both factor lists
                min_length = min(len(search_factors), len(info_factors))
                search_factors = search_factors[:min_length]
                # seems like bs, but let's leave it for history.
                # if request_type not in _META_RELEVANCE_REQUEST_TYPES:
                #     # if info-request handle on basesearch then Meta* factors is null, so align compared data
                #     _reset_matrixnet_factor(search_factors)
                #     _reset_matrixnet_factor(info_factors)
                #     _reset_meta_factors(search_factors)
                info_factors = info_factors[:min_length]

            # relevance calculated on ranking_middle include Meta* factors
            # so we can't compare it with basesearch relevance
            if doc.Relevance != info_relevance:
                result['factors'] = (search_factors, info_factors)
                eh.check_failed("Incorrect relevance {}, correct value: {}".format(info_relevance, doc.Relevance))

            if len(search_factors) > 1 and search_factors != info_factors:
                result['factors'] = (search_factors, info_factors)
                eh.check_failed("diff in factors")

        # record the duration for passed requests too, like for ignored and failed ones
        result['time'] = time.time() - start
        result['passed'] = True
    except common_errors.TaskFailure as exc:
        result['time'] = time.time() - start
        result['passed'] = False
        result['error'] = str(exc)

        if save_search_responses:
            # indeed, we need saved responses only in case of errors
            result['search_response'] = search_response

    return result


def _fill_meta_factors():
    # SEARCH-2136, etc
    factors = []
    for f in six.moves.xrange(800, 822 + 1):
        factors.append(f)
    for f in six.moves.xrange(824, 837 + 1):
        factors.append(f)
    factors += [
        842,   # MetaFeaturesVector
        862,   # LocalBclm
    ]
    for f in six.moves.xrange(890, 894 + 1):  # MetaLightreg*
        factors.append(f)
    factors += [
        900,   # MetaMoscowCommStableRandom
    ]
    for f in six.moves.xrange(1012, 1046 + 1):  # LB
        factors.append(f)
    for f in six.moves.xrange(1091, 1103 + 1):  # Streams
        factors.append(f)

    for f in six.moves.xrange(1228, 1234 + 1):  # Meta, Is*Request
        factors.append(f)
    return factors


# Indices of the meta-search specific factors, computed once when the module is loaded
_META_FACTORS = _fill_meta_factors()


def _reset_meta_factors(search_factors):
    """
        Overwrite in place every meta-search specific factor with "0"
        (before compare with base-search only factors from info request).
        Indices outside of the list are skipped.
    """
    factors_count = len(search_factors)
    for position in _META_FACTORS:
        if position >= factors_count:
            continue
        search_factors[position] = "0"


def _reset_matrixnet_factor(factors):
    if not factors:
        return
    idx = 379  # MatrixNet
    if idx < len(factors):
        factors[idx] = "0"


class Params:
    """
        Settings passed to every worker thread (see `_thread_func`).
    """

    def __init__(
        self,
        base_url,
        request_type,
        save_search_responses,
        save_patched_queries=False,
        dump_subsource_answer=False,
        disable_cache=False,
        is_int_component=False,
    ):
        # what is requested
        self.base_url, self.request_type = base_url, request_type
        self.is_int_component = is_int_component

        # how the search url is patched
        self.dump_subsource_answer, self.disable_cache = dump_subsource_answer, disable_cache

        # what is saved for debugging
        self.save_search_responses, self.save_patched_queries = save_search_responses, save_patched_queries


def _patch_search_url(search_url, doc):
    """
        Decide whether info requests can be sent on top of the given search url.

        :param search_url: search url with a query string
        :param doc: document from the search response (used for the error message only)
        :return: search_url when it has exactly one grouping ('g') param,
            None when it has several of them
        :raises Exception: when the url has no grouping param
    """
    # cannot use urlparse.parse_qs to parse query string
    # http://www.w3.org/TR/1999/REC-html401-19991224/appendix/notes.html#h-B.2.2
    # partition() splits on the first "?" only, so a url without "?" or with
    # several of them does not break the unpacking
    query_string = search_url.partition("?")[2]

    params = sq.parse_cgi_params(query_string, unquote=False)
    grouping_params = params['g']

    if len(grouping_params) > 1:
        return None

    if len(grouping_params) == 1:
        return search_url

    raise Exception(
        "cannot find grouping param, mode: {}, attr: '{}', params: {}".format(
            doc.GroupMode,
            doc.GroupAttr,
            grouping_params,
        )
    )


def _thread_func(queries, params):
    """
        Worker: for every query fetch the search response and send info requests for each found document.

        :param queries: list of query strings appended to params.base_url
        :param params: `Params` object with the settings of the run
        :return: list of result dicts, one per (document, request type) pair
    """
    test_results = []

    # the set of request types depends neither on the query nor on the document
    request_types = [params.request_type] if params.request_type else _INFO_REQUESTS_TYPES
    max_fetch_attempts = 5

    dump_path = paths.get_unique_file_name(paths.get_logs_folder(), 'inforequests-debug-info.txt')
    with open(dump_path, 'w') as dump_file:
        for query in queries:
            search_url = params.base_url + query
            search_url = response_cgi.force_timeout(search_url)
            search_url += metadebug.dump_grouping_cgi

            if params.dump_subsource_answer:
                search_url += metadebug.dump_subsource_answer_cgi

            if params.disable_cache:
                search_url += "&nocache=da"

            # dbgrlv=da option is bad for metasearch cache
            search_hr_url = search_url.replace('dbgrlv=da', '') + "&hr=da&gta=_RelevFactors"

            if params.save_patched_queries:
                dump_file.write("URL: {}\n".format(search_hr_url))

            if len(search_hr_url) < 50:
                eh.fail("Wrong url: '{}'".format(search_hr_url))

            search_response = None
            search_response_as_tree = None
            checking_result = None
            answer_is_complete = None
            for attempt in range(max_fetch_attempts):
                search_response, search_response_as_tree, checking_result = brp.fetch_response(
                    search_hr_url,
                    parse_and_check_response=True,
                )
                answer_is_complete = not brp.has_removed_docs(search_response_as_tree)
                if answer_is_complete:
                    break
                logging.warning("Retrying incomplete query, attempt %s (see SEARCH-2136)", attempt)
                if attempt + 1 < max_fetch_attempts:
                    # sleep 11 seconds not to fall into fastcache (fast cache interval is 5 sec);
                    # there is nothing to wait for after the last attempt
                    time.sleep(11)

            if not answer_is_complete:
                dump_file.write("INCOMPLETE RESPONSE:\n")
                dump_file.write(search_response)
                dump_file.write("\n====================================\n\n")

            eh.ensure(answer_is_complete, "Cannot fetch complete answer for {}".format(search_hr_url))

            if not checking_result:
                if params.save_patched_queries:
                    dump_file.write("Docs: NOT FOUND\n")
                continue

            docs = brp.get_docs_from_response(search_response_as_tree)
            if params.save_patched_queries:
                doc_ids = [d.DocId for d in docs]
                doc_ids_str = ' '.join(doc_ids)
                dump_file.write("Docs: {}\n".format(doc_ids_str))

            for doc in docs:
                patched_search_url = _patch_search_url(search_url, doc)
                if not patched_search_url:
                    continue

                for request_type in request_types:
                    dump_file.write("Type: {}, doc: {}\n".format(request_type, doc.DocId))
                    result = _send_info_request_and_check_relevance(
                        patched_search_url,
                        search_response,
                        request_type,
                        doc,
                        params.save_search_responses,
                        params.is_int_component,
                    )
                    test_results.append(result)

    return test_results
