# -*- coding: utf-8 -*-
#

from six.moves.cPickle import loads
from six.moves import zip, zip_longest
import cgi
import collections
import os
import sandbox.projects.common.search.response.state as srs

# File name of the full side-by-side HTML report (used by diff() when short_report is false).
OUTPUT_NAME = "side_by_side.html"
# File name used by diff() instead when the short report is requested.
OUTPUT_NAME_SHORT = "side_by_side_short.html"

# URLs whose length reaches this limit are cut to this many characters plus '...' when rendered.
_max_url_length = 64


def _unquote(s):
    if not s or len(s) < 2:
        return s
    if s[0] == '"' and s[-1] == '"':
        return s[1:-1]
    return s


def _d_style(doc):
    return ' style="background: {}" '.format(doc.color) if doc and doc.color else ''


class Doc(object):
    """One document of a grouping, as rendered in the side-by-side report.

    Attributes:
        relevance, src, url, pos: values passed to the constructor.
        slices: the slices value, or '?' when it contained a lowercase character.
        fmt_src: '<slices>[<src without enclosing quotes>]'.
        wiz_info: same value as ``fmt_src``.
        color, change_pos: presentation state; None until Grouping.html_diff()
            fills them in.
    """

    def __init__(self, relevance, src, url, pos, slices):
        # Reduce all strange intents (anything containing a lowercase character) to '?'.
        if slices and any(c.islower() for c in slices):
            slices = '?'
        self.relevance = relevance
        self.src = src
        self.url = url
        self.pos = pos
        self.slices = slices
        self.fmt_src = '{}[{}]'.format(slices if slices else '', _unquote(src))
        self.wiz_info = self.fmt_src
        # Defined up front so that _d_style() / change_pos_str() do not raise
        # AttributeError when called on a doc that has not been diffed yet.
        self.color = None
        self.change_pos = None

    def __eq__(self, other):
        """Docs are equal when relevance, source, url and position all match."""
        if not isinstance(other, Doc):
            return NotImplemented
        return (
            self.relevance == other.relevance
            and self.src == other.src
            and self.url == other.url
            and self.pos == other.pos
        )

    def __ne__(self, other):
        # Python 2 does not derive != from ==; keep both consistent explicitly.
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __str__(self):
        return '{} {} {} {}'.format(self.pos, self.relevance, self.url, self.fmt_src)
    __repr__ = __str__

    def change_pos_str(self):
        """Return the HTML fragment for this doc: position change, relevance, url and source.

        The url is shown without enclosing quotes and shortened to
        ``_max_url_length`` characters plus '...' when it is too long.
        When no position change has been computed the plain position is shown.
        """
        url = _unquote(self.url)
        short_url = url if len(url) < _max_url_length else (url[:_max_url_length] + '...')
        change = self.pos if self.change_pos is None else self.change_pos
        return '{} <b>{}</b> {} <font style="color:gray">{}</font>'.format(
            change, self.relevance, short_url, self.fmt_src
        )


class Grouping(object):
    """A named grouping holding an ordered list of documents."""

    def __init__(self, name):
        self.name = name
        self.docs = []

    def __eq__(self, other):
        """Groupings are equal when both the name and the doc lists are equal."""
        if not isinstance(other, Grouping):
            return NotImplemented
        return self.name == other.name and self.docs == other.docs

    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __str__(self):
        return self.name + ':\n' + ''.join('\t' + str(d) + '\n' for d in self.docs)
    __repr__ = __str__

    def contain(self, doc):
        """Return the first doc of this grouping with the same url as ``doc``, or None."""
        for d in self.docs:
            if d.url == doc.url:
                return d
        return None

    def str_diff(self, other):
        """Return a plain-text diff of this grouping against ``other``.

        Docs of this grouping missing from ``other`` are prefixed with '--- ',
        docs of ``other`` missing from this grouping with '+++ '.
        """
        if self.name != other.name:
            # The header line is terminated here as well, so that the first
            # doc row does not get glued to the names.
            parts = [self.name + ' &rarr; ' + other.name + ':\n']
        else:
            parts = [self.name + ':\n']
        # TODO:
        for d in self.docs:
            if d not in other.docs:
                parts.append('--- ')
            parts.append('\t' + str(d) + '\n')
        parts.append('===========================\n')
        for d in other.docs:
            if d not in self.docs:
                parts.append('+++ ')
            parts.append('\t' + str(d) + '\n')
        return ''.join(parts)

    @staticmethod
    def _annotate(docs, counterpart, missing_color, docs_are_old):
        """Set ``color`` and ``change_pos`` on every doc of one side of the diff.

        docs: docs of the side being annotated.
        counterpart: grouping of the opposite side (searched by url).
        missing_color: background for docs absent from the counterpart.
        docs_are_old: True when ``docs`` is the left (old) side, so that the
            position is always rendered as 'old &rarr; new'.
        """
        for d in docs:
            twin = counterpart.contain(d)
            if twin is None:
                d.color = missing_color
                twin_pos = '?'
            else:
                # Same url on both sides: highlight only when the position moved.
                d.color = None if d.pos == twin.pos else '#FFFFB2'
                twin_pos = twin.pos
            if docs_are_old:
                d.change_pos = '{} &rarr; {}'.format(d.pos, twin_pos)
            else:
                d.change_pos = '{} &rarr; {}'.format(twin_pos, d.pos)

    def html_diff(self, other):
        """Return an HTML table with this grouping on the left and ``other`` on the right.

        As a side effect ``color`` and ``change_pos`` are set on the docs of both
        groupings: docs only on the left are red, docs only on the right are
        green, docs present on both sides at different positions are yellow.
        """
        parts = [
            '<table width="100%">\n',
            '<tr><th style="background:#e0e0e0" width="50%">' + _unquote(self.name),
            # The header row is closed with </tr> (it used to end with a stray <tr>).
            '</th><th style="background:#e0e0e0"  width="50%">' + _unquote(other.name) + '</th></tr>\n',
        ]
        self._annotate(self.docs, other, '#FFB2B2', True)
        self._annotate(other.docs, self, '#B2FFB2', False)
        # Rows pair docs by index; the shorter side is padded with empty cells.
        for d1, d2 in zip_longest(self.docs, other.docs):
            parts.append('<tr><td{}>'.format(_d_style(d1)))
            if d1 is not None:
                parts.append(d1.change_pos_str())
            parts.append('</td><td{}>'.format(_d_style(d2)))
            if d2 is not None:
                parts.append(d2.change_pos_str())
            parts.append('</td></tr>\n')
        parts.append('</table>')
        return ''.join(parts)


def safe_dict(d):
    """Return ``d`` itself when it is truthy, otherwise a fresh empty dict."""
    if d:
        return d
    return {}


# URL prefixes used by Response.fake_docs_urls(): a FAKE-source document url that
# starts with one of these prefixes is replaced by the prefix followed by '...',
# so that urls differing only after the prefix are counted under one key.
_collapse_url_prefixes = (
    '//market.yandex.',
    'yandex.ru/video/search?',
    'yandex.by/video/search?',
    'yandex.com.tr/video/search?',
    'https://gorod.yandex.ru/search?',
    'maps.yandex.ru',
    'http://www.startv.com.tr',
    'https://eksisozluk.com/',
    'https://itunes.apple.com/',
    'https://tr.wikipedia.org',
    'https://tr-tr.facebook.com',
    'https://twitter.com/',
    'https://www.facebook.com/',
    'www.haberturk.com',
    'www.hurriyet.com.tr/',
    'www.oyunskor.com/',
)


class Response(object):
    """Parsed search response: a mapping of grouping name to grouping object (``grgs``)."""

    def __init__(self):
        self.grgs = {}

    def __eq__(self, other):
        """Responses are equal when their grouping mappings are equal."""
        if not isinstance(other, Response):
            return NotImplemented
        return self.grgs == other.grgs

    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def str_diff(self, other):
        """Return a plain-text diff of the groupings of this response against ``other``.

        Groupings only in this response are reported as 'OLD:', changed ones as
        'CHANGE: ', groupings only in ``other`` as 'NEW:'.
        """
        own = self.grgs or {}
        theirs = other.grgs or {}
        parts = []
        # .items() instead of the Python-2-only .iteritems() keeps this usable on Python 3.
        for name, grg in own.items():
            grg2 = theirs.get(name)
            if grg2 is None:
                parts.append('OLD:' + str(grg))
            elif grg != grg2:
                parts.append('CHANGE: ' + grg.str_diff(grg2))
        for name, grg in theirs.items():
            if name not in own:
                parts.append('NEW:' + str(grg))
        return ''.join(parts)

    def html_diff(self, other):
        """Return the concatenated HTML diff tables of all groupings that differ.

        A grouping that exists on one side only is diffed against an empty
        grouping with the same name.
        """
        own = self.grgs or {}
        theirs = other.grgs or {}
        parts = []
        for name, grg in own.items():
            grg2 = theirs.get(name)
            if grg2 is None:
                parts.append(grg.html_diff(Grouping(name)))
            elif grg != grg2:
                parts.append(grg.html_diff(grg2))
        for name, grg in theirs.items():
            if name not in own:
                parts.append(Grouping(name).html_diff(grg))
        return ''.join(parts)

    def _d_docs(self):
        """Return the docs of the '"d"' grouping, or None when there is no such grouping."""
        grg = self.grgs.get('"d"')
        return None if grg is None else grg.docs

    def fake_docs_urls(self):
        """
            Collect the urls of documents coming from the FAKE source (only from the 'd' grouping).
            (These are assumed to be wizards that can be classified by their, sometimes truncated, url.)
            Returns None when the response has no 'd' grouping.
        """
        docs = self._d_docs()
        if docs is None:
            return None
        urls = []
        for d in docs:
            if d.src != '"FAKE"':
                continue
            u = _unquote(d.url)
            # hack for reduce mutated urls
            for prefix in _collapse_url_prefixes:
                if u.startswith(prefix):
                    u = u[:len(prefix)] + '...'
            urls.append(u)
        return urls

    def ms_docs_info(self):
        """
            Collect ``wiz_info`` of the documents marked with Slices=... (only from the 'd' grouping).
            (These are assumed to be wizards.)
            Returns None when the response has no 'd' grouping.
        """
        docs = self._d_docs()
        if docs is None:
            return None
        return [d.wiz_info for d in docs if d.slices is not None]

    def not_fake_docs_urls(self):
        """
            Collect the urls of documents that do not come from the FAKE source (only from the 'd' grouping).
            (These are assumed not to be wizards.)
            Returns None when the response has no 'd' grouping.
        """
        docs = self._d_docs()
        if docs is None:
            return None
        return [_unquote(d.url) for d in docs if d.src != '"FAKE"']


class SbsInfo(object):
    """Container of per-request comparison data, later consumed by SbsMaker.merge_info().

    It keeps two search-props counters, the parsed responses of both sides and
    the query description keyed by request number, and the number of requests seen.
    """

    def __init__(self):
        self.sp_stat1 = collections.defaultdict(int)
        self.sp_stat2 = collections.defaultdict(int)
        self.r1, self.r2 = {}, {}
        self.q = {}
        self.reqs = 0

    def add_diff(self, num, node1, node2, descr=None):
        """Register request ``num`` together with both of its response nodes.

        The query is stored as ``descr``, or as ``str(num)`` when no description is given.
        """
        for stat, node in ((self.sp_stat1, node1), (self.sp_stat2, node2)):
            _update_search_props_stat(stat, node)
        self.r1[num] = _get_response(node1)
        self.r2[num] = _get_response(node2)
        self.q[num] = str(num) if descr is None else descr
        self.reqs += 1

    def add_not_diff(self):
        """Count one more request without storing anything about it."""
        self.reqs += 1


def merge_stat(dest, src):
    """Add every counter of ``src`` to the counter with the same key in ``dest``.

    ``dest`` is modified in place; keys missing from it start at 0.
    """
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for key, value in src.items():
        dest[key] = dest.get(key, 0) + value


class SbsMaker(object):
    """Accumulates statistics and grouping diffs and writes them as one HTML report.

    Responses are fed in through diff_response_nodes() / diff_response() /
    merge_info(); write() then produces the report file.
    """

    def __init__(self, short_report=True, generate_wizards_stat=True):
        self.has_diff = False
        self.fake_src_stat1 = collections.defaultdict(int)
        self.fake_src_stat2 = collections.defaultdict(int)
        self.ms_stat1 = collections.defaultdict(int)
        self.ms_stat2 = collections.defaultdict(int)
        self.sp_stat1 = collections.defaultdict(int)
        self.sp_stat2 = collections.defaultdict(int)
        self.dup_host_stat1 = collections.defaultdict(int)
        self.dup_host_stat2 = collections.defaultdict(int)
        self.total_reqs = 0
        self.diffs = ''
        self.short_report = short_report
        self.generate_wizards_stat = generate_wizards_stat
        # Output file object; only set while write() is running.
        self.fout = None

    def merge_info(self, sbs_info):
        """Merge the data collected in ``sbs_info`` (stats, responses, request count)."""
        merge_stat(self.sp_stat1, sbs_info.sp_stat1)
        merge_stat(self.sp_stat2, sbs_info.sp_stat2)
        # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
        for num, q in sbs_info.q.items():
            self.diff_response(sbs_info.r1[num], sbs_info.r2[num], num, q)
        self.total_reqs += sbs_info.reqs

    def diff_response_nodes(self, rn1, rn2, num, q):
        """Process one request given the raw response nodes of both sides."""
        _update_search_props_stat(self.sp_stat1, rn1)
        _update_search_props_stat(self.sp_stat2, rn2)
        r1 = _get_response(rn1)
        r2 = _get_response(rn2)
        self.diff_response(r1, r2, num, q)
        self.total_reqs += 1

    def diff_response(self, r1, r2, num, q):
        """Update the per-side statistics and, when the responses differ, append their HTML diff."""
        self._update_stat(self.fake_src_stat1, r1.fake_docs_urls())
        self._update_stat(self.fake_src_stat2, r2.fake_docs_urls())
        self._update_stat(self.ms_stat1, r1.ms_docs_info())
        self._update_stat(self.ms_stat2, r2.ms_docs_info())
        _update_dup_host_stat(self.dup_host_stat1, r1)
        _update_dup_host_stat(self.dup_host_stat2, r2)

        if r1 != r2:
            # NOTE(review): has_diff is not touched here, so grouping differences
            # alone never make diff() return True; it is only set in write_stat().
            # Confirm with the callers whether that is intended.
            self.diffs += '<hr />\n{}{}'.format(self.format_query(num, q), r1.html_diff(r2))

    @staticmethod
    def _update_stat(stat, upd_info):
        """Increment the counter of every item of ``upd_info``; None/empty input is ignored."""
        if not upd_info:
            return
        for i in upd_info:
            stat[i] += 1

    def format_query(self, num, q):
        """Return the HTML block showing the (escaped) query ``q``; ``num`` is not used."""
        return '<div class="cgi">{}</div>'.format(cgi.escape(q))

    def _write(self, text):
        """Write ``text`` to the report file.

        The file is opened in binary mode, so text that is not already a byte
        string is encoded as UTF-8 first (required on Python 3, harmless on Python 2).
        """
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        self.fout.write(text)

    def write(self, filename):
        """Write the whole report (header, statistics, grouping diffs, footer) to ``filename``."""
        with open(filename, 'wb') as fdiff:
            self.fout = fdiff
            try:
                self.write_header()
                self.write_stat()
                self.write_groupings()
                self.write_footer()
            finally:
                # Never keep a reference to a closed file, even when a step failed.
                self.fout = None

    def write_header(self):
        self._write("""<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
<style>
body {
    background: white;
}
table {
    border-collapse: collapse;
}
td, th {
    border: 1px solid gray;
    padding: 3px;
}
div.cgi {
    font-family: monospace;
    overflow-x: auto;
    white-space: nowrap;
    padding: 2px;
    margin-bottom: 5px;
}
</style>
</head>
<body>
""")

    def _write_stat_table(self, stat1, stat2, title):
        """Write one statistics table comparing ``stat1`` with ``stat2``."""
        self._write(_format_stat(stat1, stat2, self.total_reqs, title, self.short_report))

    def write_stat(self):
        """Write the statistics tables and set ``has_diff`` when any section is emitted."""
        if self.generate_wizards_stat:
            self._write_stat_table(
                self.fake_src_stat1,
                self.fake_src_stat2,
                'Оценка показов колдунщиков [by FAKE source]',
            )
            self._write_stat_table(
                self.ms_stat1,
                self.ms_stat2,
                'Оценка показов колдунщиков [by _Marker/Slices]',
            )
            self.has_diff = True

        if self.sp_stat1 or self.sp_stat2:
            self._write_stat_table(
                self.sp_stat1,
                self.sp_stat2,
                'SearchProps/SearchProperties stat',
            )
            self.has_diff = True

        if self.dup_host_stat1 or self.dup_host_stat2:
            self._write_stat_table(
                self.dup_host_stat1,
                self.dup_host_stat2,
                'Более одной ссылки на документы с одного хоста',
            )
            self.has_diff = True

    def write_groupings(self):
        self._write('<h3>Изменения в группировках</h3>')
        self._write(self.diffs)

    def write_footer(self):
        self._write('</body></html>\n')


def diff(
    queries,
    responses1,
    responses2,
    diff_path,
    compare_only_same_not_answer_level=True,
    generate_wizards_stat=True,
    short_report=False
):
    """Build the side-by-side report for two series of pickled responses.

    ``queries``, ``responses1`` and ``responses2`` are walked in lockstep. With
    ``compare_only_same_not_answer_level`` set, a pair is skipped when
    ``srs.not_respond_state`` differs between its two responses. The report is
    written into ``diff_path`` (short or full file name depending on
    ``short_report``) and the maker's ``has_diff`` flag is returned.
    """
    maker = SbsMaker(short_report, generate_wizards_stat)
    for num, (q, raw1, raw2) in enumerate(zip(queries, responses1, responses2)):
        # NOTE(review): the responses are unpickled; this is only safe for trusted input.
        rn1 = loads(raw1)
        rn2 = loads(raw2)
        if compare_only_same_not_answer_level and srs.not_respond_state(rn1) != srs.not_respond_state(rn2):
            # Skipped pairs still consume a request number.
            continue
        maker.diff_response_nodes(rn1, rn2, num, q)

    report_name = OUTPUT_NAME_SHORT if short_report else OUTPUT_NAME
    maker.write(os.path.join(diff_path, report_name))
    return maker.has_diff


def _slices_from_archive_info(archive_info):
    """Return the slices name found in the '_Markers' GTA attribute of ``archive_info``, or None."""
    gtas = archive_info._nodes.get('GtaRelatedAttribute')
    if not gtas:
        return None
    for gta in gtas:
        if gta._props.get('Key')[0] != '"_Markers"':
            continue
        marker = gta._props.get('Value')[0]
        if marker.startswith('"Slices='):
            # Drop the 8-character '"Slices=' prefix and keep what precedes the first ':'.
            return marker[8:].partition(':')[0]
    return None


def _get_response(r):
    """
        Parse a response Node into a Response object.

        Every 'Grouping' node that has an 'Attr' property becomes a Grouping named
        after the first 'Attr' value. Positions count the groups that have a
        document, including those later skipped for lacking ArchiveInfo or Url.
    """
    rsp = Response()
    grouping_nodes = r._nodes.get('Grouping')
    if grouping_nodes is None:
        return rsp
    for grouping_node in grouping_nodes:
        if 'Attr' not in grouping_node._props:
            continue
        name = grouping_node._props['Attr'][0]
        grouping = Grouping(name)
        group_nodes = grouping_node._nodes['Group']
        pos = 0
        for group_node in group_nodes:
            documents = group_node._nodes.get('Document')
            if not documents:
                continue
            pos += 1
            doc_node = documents[0]
            archive_infos = doc_node._nodes.get('ArchiveInfo')
            if not archive_infos:
                continue
            urls = archive_infos[0]._props.get('Url')
            if not urls:
                continue
            url = urls[0]
            slices = _slices_from_archive_info(archive_infos[0])
            server_descr = doc_node._props.get('ServerDescr')
            source = server_descr[0] if server_descr else '_unknown_'
            # SRelevance takes precedence; Relevance is the fallback.
            relevance = doc_node._props.get("SRelevance") or doc_node._props.get("Relevance")
            grouping.docs.append(Doc(relevance[0], source, url, pos, slices))
        rsp.grgs[name] = grouping
    return rsp


def _update_dup_host_stat(stat, r):
    urls = r.not_fake_docs_urls()
    if not urls:
        return
    # left only scheme/host/port in url
    hosts = [_get_hostport(url) for url in urls]
    # count
    hs = {}
    for h in hosts:
        if h in hs:
            hs[h] += 1
        else:
            hs[h] = 0
    for k, hc in hs.iteritems():
        if hc:
            hc = str(hc + 1) + 'x'
            stat[hc] += 1


def _get_hostport(url):
    if url.startswith('http://'):
        url = url[7:]
    elif url.startswith('https://'):
        url = url[8:]
    return url.partition('/')[0]


def _update_search_props_stat(stat, rn):
    """
        Collect the important SearchProp/SearcherProp values from a response.
        Analytics that makes it easier to see what is happening with the search results.
    """
    updaters = (
        _update_porno_pl_stat,
        _update_blur_stat,
        _update_fresh_for_video_wiz,
        _update_fresh_for_images_wiz,
    )
    for update in updaters:
        update(stat, rn)


def _update_porno_pl_stat(stat, rn):
    """
        SearcherProp, отвечающий за маркирование уровня порнушности запроса.
        Используется для отбора запросов в suggest
    """
    sps = rn._nodes.get('SearcherProp')
    prop_names = ('Porno.pl', 'PornoUpper.pl')
    if not sps:
        return
    for sp in sps:
        if sp.GetPropValue('Key') not in prop_names:
            continue
        k = 'Уровень порнушности для suggest-а: SearcherProp-Porno.pl=' + sp.GetPropValue('Value')
        stat[k] += 1


def _update_blur_stat(stat, rn):
    """
        SearcherProp, обозначающий срабатывание замыливания (blur) изображений в видеоколдунщике
    """
    sps = rn._nodes.get('SearcherProp')
    if not sps:
        return
    for sp in sps:
        if sp.GetPropValue('Key') != 'VideoSerpdata.BlurThumbnails':
            continue
        k = 'Размытие thumb-ов в видеоколдунщике SearcherProp-VideoSerpdata.BlurThumbnails=' + sp.GetPropValue('Value')
        stat[k] += 1


def _update_fresh_for_video_wiz(stat, rn):
    """
        SearcherProp, обозначающий срабатываение подмешшивания свежих данных в видеоколдунщике
    """
    sps = rn._nodes.get('SearcherProp')
    if not sps:
        return
    for sp in sps:
        if sp.GetPropValue('Key') != 'ApplyVideoBlender.IntentWeight/VIDEOQUICK':
            continue
        value = float(sp.GetPropValue('Value'))

        if value > 0:
            k1 = 'Подмешивания свежести в ответ видео SearcherProp-ApplyVideoBlender.IntentWeight/VIDEOQUICK>0'
            stat[k1] += 1
        if value >= 0.3:
            k2 = 'Подмешивания свежести в ответ видео в топ4 SearcherProp-ApplyVideoBlender.IntentWeight/VIDEOQUICK>=0.3'
            stat[k2] += 1


def _update_fresh_for_images_wiz(stat, rn):
    """
        SearcherProp, обозначающий срабатываение подмешшивания свежих данных в картиночный колдунщик
    """
    sps = rn._nodes.get('SearcherProp')
    if not sps:
        return
    for sp in sps:
        if sp.GetPropValue('Key') != 'ApplyImagesBlender.IntentWeight/VIDEOQUICK':
            continue
        value = float(sp.GetPropValue('Value'))

        if value > 0:
            k1 = 'Подмешивания свежести в ответ картинок SearcherProp-ApplyImagesBlender.IntentWeight/VIDEOQUICK>0'
            stat[k1] += 1
        if value >= 0.3:
            k2 = 'Подмешивания свежести в ответ картинок в топ10 SearcherProp-ApplyImagesBlender.IntentWeight/VIDEOQUICK>=0.3'
            stat[k2] += 1


def _format_ustat(u, v1, v2, nreqs):
    s = '<tr><td>'
    s += u
    s += '</td><td>'
    pc1 = 100.0*v1/nreqs
    s += '{} ({:.2f}%)'.format(v1, pc1)
    s += '</td><td>'
    pc2 = 100.0*v2/nreqs
    s += '{} ({:.2f}%)'.format(v2, pc2)
    pcd = pc2 - pc1
    pcc = (100.0 * pcd / pc1) if pc1 > 0.0001 else 10000.0
    if pcc > 10.0:
        color = '#B0FFB0'
    elif pcc < -10.0:
        color = '#FFB0B0'
    elif pcc != 0:
        color = '#F0F0F0'
    else:
        color = None
    s += '</td><td{}>'.format(' style="background: {}"'.format(color) if color else '')
    s += '{} ({:+.2f}%) ({:+.0f}%%)'.format(v2-v1, pcd, pcc)
    s += '</td></tr>'
    return s


def _format_stat(stat1, stat2, nreqs, descr, short_report):
    if not nreqs:
        return ''
    table_header = '<tr><th>Value</th><th>v1</th><th>v2</th><th>delta</th></tr>\n'
    s = '<h3>{}</h3>\n<table>\n{}'.format(descr, table_header)
    rows = []
    for u, v1 in stat1.iteritems():
        v2 = stat2.get(u, 0)
        if not short_report or v1 != v2:
            rows.append(_format_ustat(u, v1, v2, nreqs))
    for u, v2 in stat2.iteritems():
        if u in stat1:
            continue
        elif not short_report or v2 != 0:
            rows.append(_format_ustat(u, 0, v2, nreqs))
    if not rows:  # do not show empty tables
        return ''
    for row in sorted(rows):
        s += row
    s += '</table>'
    return s
