# coding=utf-8
import attr
import base64
import hashlib
import random
import string
from bs4 import BeautifulSoup, Tag
from collections import defaultdict
import re

from settings import SerpSettings
from Screenshotter import Image

__author__ = 'irlab'


@attr.s
class Serp(object):
    """A single parsed search engine result page (SERP).

    Instances are created and populated by ``SerpParser.parse_serp``.
    """
    # Identifier of the page (``parse_serp`` derives it from the raw html).
    id = attr.ib(type=str)
    # Query text extracted from the page by the settings object.
    query_text = attr.ib(default=None, type=str)
    # Prettified page markup after the ``seanid`` attributes were added.
    html_with_seanid = attr.ib(default=None, type=str)
    # :type: list[SerpElement]
    serp_elem_list = attr.ib(default=attr.Factory(list))


@attr.s
class SerpElement(object):
    """One block of a SERP that was tagged with a ``seanid`` attribute.

    ``SerpParser.find_serp_elements`` creates the instances;
    ``SerpParser.parse_serp`` then fills in the skeleton fields and resets
    ``dom_element`` to ``None``.
    """
    # Random 8-character id that is also written to the DOM node.
    seanid = attr.ib(type=str)
    # The DOM node of the block (dropped once the skeletons are computed).
    dom_element = attr.ib()
    # Query text and id of the SERP this block belongs to.
    query_text = attr.ib(type=str)
    serp_id = attr.ib(type=str)
    # Structural signature of the block and its base64-encoded md5 digest.
    skeleton = attr.ib(default=None, type=str)
    skeleton_md5 = attr.ib(default=None, type=str)
    # Signature produced by the evaluator configured with the "short" settings.
    short_skeleton = attr.ib(default=None, type=str)
    # Not assigned anywhere in this module -- presumably a screenshot of the
    # block; TODO confirm against the Screenshotter module.
    image = attr.ib(default=None, type=Image)


@attr.s
class SerpElementExampleSet(object):
    """A group of SERP elements together with a few counters.

    NOTE(review): nothing in this module fills these fields; presumably the
    set collects example elements that share one skeleton -- confirm against
    the callers.
    """
    skeleton = attr.ib(default=None, type=str)
    short_skeleton = attr.ib(default=None, type=str)
    # :type: list[SerpElement]
    serp_elem_list = attr.ib(default=attr.Factory(list))
    short_skeleton_set = attr.ib(default=attr.Factory(set))
    # Counters, all starting at zero.
    serp_elem_count = attr.ib(default=0)
    query_count = attr.ib(default=0)
    count_good_images = attr.ib(default=0)


class SkeletonEvaluator:
    """Computes a "skeleton" (structural signature) for a SERP DOM element.

    A skeleton is either a short synthetic marker (for blocks recognised by
    their heading or by one of the custom detectors) or a normalised copy of
    the element's tag structure in which only tag names and accepted CSS
    classes are kept.
    """

    def __init__(self, settings):
        """Remember the settings object that drives skeleton construction.

        :param settings: a ``SerpSettings`` instance.
        """
        assert isinstance(settings, SerpSettings)
        self.settings = settings

    def eval_skeleton(self, dom_element, css_props):
        """Return the skeleton of ``dom_element``.

        The detectors are tried in order: heading skeleton, custom skeleton,
        and finally the generic structural skeleton.

        :param dom_element: bs4 ``Tag`` to describe.
        :param css_props: ``CssProps`` parsed from the same page.
        :return: the skeleton string. In the generic case this is either the
            sorted tag set joined with newlines (``SKELETON_USE_TAGSET``) or
            the utf8-encoded prettified markup with empty ``<div>`` pairs
            collapsed to ``<div/>``.
        """
        assert isinstance(dom_element, Tag)
        assert isinstance(css_props, CssProps)

        heading_skeleton = self.eval_heading_skeleton(dom_element, css_props)
        if heading_skeleton:
            return heading_skeleton

        custom_skeleton = self.eval_custom_skeleton(dom_element, css_props)
        if custom_skeleton:
            return custom_skeleton

        # Build the structural copy inside a fresh, empty document.
        soup = BeautifulSoup('', "html.parser")
        self.eval_skeleton_recursive(dom_element, soup, soup)
        self.simplify_similar_sequences_recursive(soup)

        if self.settings.SKELETON_USE_TAGSET:
            # The builtin next() works on every iterator, unlike the
            # Python-2-only ``.next()`` method.
            tag_set = self.get_tag_set(next(soup.children))
            skeleton = '\n'.join(sorted(tag_set))
        else:
            skeleton = soup.prettify('utf8')
            skeleton = re.sub(r'<div>\s*</div>', '<div/>', skeleton)
        return skeleton

    def _is_heading_tag(self, t, css_props):
        """Tell whether tag ``t`` looks like a block heading.

        A tag qualifies when it is ``<g-tray-header>``, ``<h2>``, carries
        ``role="heading"``, or is styled as bold upper-case text.
        """
        if t.name == 'g-tray-header' or t.attrs.get("role") == "heading" or t.name == "h2":
            return True
        if css_props.get_element_property(t, 'text-transform') == 'uppercase' and css_props.is_element_text_bold(t):
            return True
        return False

    def eval_heading_skeleton(self, tag, css_props):
        """Return ``<h2>text</h2>`` when ``tag`` starts with a heading.

        :param tag: element to inspect.
        :param css_props: ``CssProps`` used for the style checks.
        :return: the heading skeleton, or a falsy value (``False`` when no
            heading tag exists, ``None`` when the heading does not qualify).
        """
        heading_tag = tag.find(lambda t: self._is_heading_tag(t, css_props))
        if not heading_tag:
            return False

        # Take the text of the first leaf (a tag without child tags) inside
        # the heading that has non-blank text.
        heading_text = None
        leaf = None
        for leaf in [heading_tag] + list(heading_tag.find_all()):
            has_children = leaf.find()
            if not has_children and leaf.get_text(strip=True):
                heading_text = leaf.get_text(strip=True)
                break

        # The heading only counts when the whole element's text begins with it.
        if heading_text and tag.get_text(strip=True).startswith(heading_text):
            if heading_tag.name == 'h2' or css_props.is_element_text_bold(leaf, root=heading_tag):
                return u"<h2>" + heading_text + u"</h2>"
        return None

    def eval_custom_skeleton(self, tag, css_props):
        """Return a synthetic skeleton for a few specially recognised blocks.

        Detectors, in order: map image, dark rounded header with bold
        headings, advertisement label, forum carousel.

        :return: the marker string, or ``None`` when no detector matches.
        """
        ############################
        # Map blocks: an <img> with the map alt text or a maps tile source.
        def is_geo_image(t):
            if not t.name == 'img':
                return False
            if t.attrs.get('alt') == 'map image':
                return True
            data_bsrc = t.attrs.get('data-bsrc')
            return data_bsrc and data_bsrc.startswith('/maps/vt/data')
        if tag.find(is_geo_image):
            return u"<h2>geo_wizard</h2>"

        ############################
        # Blocks with a header that has a top-rounded border and a background.
        def is_dark_header(t):
            return css_props.get_element_property(t, 'border-radius') == '8px 8px 0 0' \
                and (css_props.get_element_property(t, 'background-color') or css_props.get_element_property(t, 'background'))
        if tag.find(is_dark_header):
            headers = set()
            for header in tag.find_all(lambda t: self._is_heading_tag(t, css_props)):
                if not css_props.is_element_text_bold(header, root=tag):
                    continue
                header_text = header.get_text(strip=True)
                if header_text:
                    headers.add(u'  <h3>' + header_text + '</h3>')
            if headers:
                return u'<dark_header>\n' + '\n'.join(sorted(headers)) + '\n</dark_header>'

        ############################
        # Advertisement blocks: a <span> whose text starts with the ad label.
        if tag.find(lambda t: t.name == 'span' and t.get_text(strip=True).startswith(u'Реклама\xb7')):
            return u'<h2>Реклама</h2>'

        ############################
        # Forum snippets: a carousel with several rounded cards and a
        # "view all posts" element that sits next to a button.
        carousel = tag.find('g-scrolling-carousel')
        if not carousel:
            return None
        rounded_cards = carousel.find_all(lambda t: t.name == 'div' and css_props.get_element_property(t, 'border-radius'))
        if len(rounded_cards) <= 1:
            return None
        if not carousel.find(lambda t: t.text in (u'Посмотреть все записи', u'View all posts') and t.parent.find('button', recursive=False)):
            return None
        # The grey counter text distinguishes comment and answer snippets.
        if tag.find(lambda t: css_props.get_element_property(t, 'color') == '#70757A' and u'сообщен' in t.text):
            return u'<h2>Forum Snippet Comments</h2>'
        if tag.find(lambda t: css_props.get_element_property(t, 'color') == '#70757A' and u'ответ' in t.text):
            return u'<h2>Forum Snippet Answers</h2>'
        return u'<h2>Forum Snippet</h2>'

    def get_tag_set(self, tag):
        """Collect the tag strings of ``tag`` and all its descendant tags.

        :return: set of strings produced by ``get_tag_with_css_string``.
        """
        tag_set = set()
        tag_set.add(self.get_tag_with_css_string(tag))
        for child in tag.children:
            if isinstance(child, Tag):
                tag_set.update(self.get_tag_set(child))
        return tag_set

    def get_tag_with_css_string(self, tag):
        """Render ``tag`` as ``<name class="sorted classes"/>``.

        The ``class`` attribute is always emitted, even when it is empty.
        """
        return '<' + tag.name + (' class="%s"' % ' '.join(sorted(tag.get("class", [])))) + '/>'

    def eval_skeleton_recursive(self, origin, soup, parent):
        """Append a stripped copy of ``origin`` and its subtree to ``parent``.

        The copy keeps the tag name, the accepted CSS classes (sorted) and,
        if ``skeleton_skip_id`` allows it, the id. Child tags rejected by
        ``skeleton_accepts_tag`` and all non-tag nodes are dropped.

        :param origin: source tag.
        :param soup: document used to create the new tags.
        :param parent: node that receives the copy.
        """
        tag = soup.new_tag(origin.name)
        if origin.get('id') and not self.skeleton_skip_id(origin['id']):
            tag['id'] = origin['id']
        # A real list (not a lazy filter object) so that the emptiness test
        # below is reliable.
        css_class_list = [c for c in origin.get("class", []) if self.settings.skeleton_accepts_class(c)]
        if css_class_list:
            tag['class'] = sorted(css_class_list)
        parent.append(tag)

        for child in origin.children:
            if not isinstance(child, Tag):
                continue
            if not self.skeleton_accepts_tag(child):
                continue
            self.eval_skeleton_recursive(child, soup, tag)

    def skeleton_accepts_tag(self, tag):
        """Return ``False`` if any selector of ``tag`` is in ``SKELETON_SKIP_SELECTORS``."""
        for selector in self.get_selectors(tag):
            if selector in self.settings.SKELETON_SKIP_SELECTORS:
                return False
        return True

    def skeleton_skip_id(self, tag_id):
        """Tell whether ``tag_id`` must be left out of the skeleton.

        Ids are currently never used in skeletons, so this always returns
        ``True``. An earlier variant (now disabled) skipped only ids matching
        ``SKELETON_SKIP_SELECTORS_REGEX`` or ``SerpSettings.is_cbk``.
        """
        return True

    @staticmethod
    def get_selectors(tag):
        """List the simple selectors of ``tag``: name, ``#id`` and ``.class`` entries."""
        tag_selectors = [tag.name]
        if tag.get('id'):
            tag_selectors.append('#' + tag['id'])
        for css_class in tag.get("class", []):
            tag_selectors.append('.' + css_class)
        return tag_selectors

    def simplify_similar_sequences_recursive(self, tag):
        """Cut long runs of identical consecutive children, bottom-up.

        Within every run of children that render to the same string, only
        the first ``SKELETON_CUT_SEQUENCES_LENGTH`` are kept; the rest are
        extracted from the tree.
        """
        children = list(tag.children)
        # Simplify the subtrees first so that siblings are compared in their
        # already-simplified form.
        for c in children:
            self.simplify_similar_sequences_recursive(c)

        max_run = self.settings.SKELETON_CUT_SEQUENCES_LENGTH
        if len(children) < max_run:
            return
        last_children_str = None
        last_similar_count = 0
        for c in children:
            s = str(c)
            if s == last_children_str:
                last_similar_count += 1
                if last_similar_count >= max_run:
                    c.extract()  # remove the surplus duplicate
            else:
                last_children_str = s
                last_similar_count = 0


class CssProps:
    """Lookup table of CSS declarations keyed by class name.

    The table is filled from the ``<style>`` sections of a page and is used
    to answer simple questions about elements ("is the text bold?", "does it
    have a rounded border?") based only on their ``class`` attribute.
    """

    def __init__(self, html=None):
        """Create the table, parsing ``html`` when it is given and non-empty.

        :param html: page markup containing ``<style>`` sections, or ``None``.
        """
        # :type: dict[str, dict[str, str]]  (class name -> property -> value)
        self.css_class2props = defaultdict(dict)
        if html:
            self._parse(html)

    def _parse(self, html):
        """Extract the class-level declarations from every ``<style>`` section.

        For each ``selector{declarations}`` rule, the declarations are
        recorded for every class name (3-20 characters) found in the
        selector, except for class names that are followed by a space.
        Later rules overwrite earlier values of the same property.
        """
        for style_section in re.finditer(r'(?s)<style.*?</style>', html):
            for selector_match in re.finditer(r'(.+?){(.+?)}', style_section.group(0)):
                selector = selector_match.group(1)

                # Parse the declarations once per rule instead of once per
                # class name in the selector.
                declarations = []
                for key_value_pair in selector_match.group(2).split(';'):
                    if ':' not in key_value_pair:
                        continue
                    css_prop, value = key_value_pair.strip().split(':', 1)
                    value = value.replace('!important', '').strip()
                    declarations.append((css_prop.strip(), value))

                for class_name_match in re.finditer(r'\.([\w\d\-_]{3,20})', selector):
                    class_name = class_name_match.group(1)
                    if '.' + class_name + ' ' in selector:
                        # Skip top-level selectors that are refined by a
                        # following (descendant) selector.
                        continue
                    for css_prop, value in declarations:
                        self.css_class2props[class_name][css_prop] = value

    def get_property(self, css_class_name, css_prop):
        """Return the value of ``css_prop`` for the class, or ``None``."""
        return self.css_class2props[css_class_name].get(css_prop)

    def has_border(self, css_class_name):
        """Tell whether the class declares any property starting with ``border``."""
        for css_prop in self.css_class2props[css_class_name]:
            if css_prop.startswith('border'):
                return True
        return False

    def is_bold(self, css_class_name):
        """Tell whether the class declares ``font-weight: bold``.

        Classes without a ``font-weight`` declaration yield ``False`` rather
        than raising ``KeyError``.
        """
        return self.get_property(css_class_name, 'font-weight') == 'bold'

    def is_element_with_round_border(self, element):
        """Tell whether any class of ``element`` declares a ``border-radius``."""
        css_class_list = element.attrs.get("class", [])
        return any(self.get_property(c, 'border-radius') for c in css_class_list)

    def find_property_with_inheritance(self, element, css_prop, root=None, go_up_max=1000, startswith=False):
        """Look a property up on ``element`` and then on its ancestors.

        :param element: element to start from.
        :param css_prop: property name, or a name prefix when ``startswith``.
        :param root: topmost element to inspect; the walk stops before
            ``root.parent``. ``None`` means no such limit.
        :param go_up_max: maximum number of steps up from ``element``
            (``0`` inspects only ``element`` itself).
        :param startswith: match every property whose name starts with
            ``css_prop`` and return the first value found.
        :return: the property value, or ``None`` when nothing matches.
        """
        # Identity is what is meant here: bs4 tags compare equal by markup,
        # which is both slower and not the intended check.
        stop_at = root.parent if root is not None else None
        depth = 0
        while element is not None and (root is None or element is not stop_at) and depth <= go_up_max:
            for css_class in element.attrs.get("class", []):
                if startswith:
                    for name, value in self.css_class2props[css_class].items():
                        if name.startswith(css_prop):
                            return value
                else:
                    prop_value = self.get_property(css_class, css_prop)
                    if prop_value:
                        return prop_value
            element = element.parent
            depth += 1
        return None

    def get_element_property(self, element, css_prop):
        """Return ``css_prop`` declared by the element's own classes (no inheritance)."""
        return self.find_property_with_inheritance(element, css_prop, go_up_max=0)

    def is_element_text_bold(self, element, root=None, go_up_max=1000):
        """Tell whether the nearest ``font*`` declaration starts with ``bold``.

        NOTE(review): the lookup matches any property whose name starts with
        ``font`` (``font-size``, ``font-family``, ...), so the answer depends
        on which of them is encountered first -- confirm this is intended.

        :return: ``True``/``False``, or ``None`` when no (non-empty)
            ``font*`` value is found.
        """
        font = self.find_property_with_inheritance(element, 'font', root=root, go_up_max=go_up_max, startswith=True)
        if font:
            return font.startswith('bold')
        return None


class SerpParser:
    """Splits a SERP into elements and computes a skeleton for each of them."""

    def __init__(self, settings, short_settings):
        """Create the parser and its two skeleton evaluators.

        :param settings: ``SerpSettings`` used for parsing and for the full
            skeleton.
        :param short_settings: settings used for the short skeleton.
        """
        assert isinstance(settings, SerpSettings)
        self.settings = settings
        self.skeleton_evaluator = SkeletonEvaluator(settings)
        self.short_skeleton_evaluator = SkeletonEvaluator(short_settings)

        # Per-page state; replaced at the start of every parse_serp() call.
        self.current_serp = None
        self.rand = random.Random(0)
        self.css_props = CssProps()

    def parse_serp(self, html):
        """Parse one page and return it as a ``Serp``.

        Every detected element gets ``sean``/``seanid`` attributes in the
        DOM, a skeleton, its md5 and a short skeleton; the modified markup is
        stored in ``Serp.html_with_seanid``.

        :param html: utf8-encoded page markup.
        :raises Exception: when the page is the simplified version of the
            SERP.
        """
        serp = Serp(id=self.html2serp_id(html))
        soup = BeautifulSoup(html, 'html.parser', from_encoding="utf8")
        serp.query_text = self.settings.get_query_text(soup)

        self.current_serp = serp
        # Seed from the page id so the seanid sequence is tied to the page.
        self.rand = random.Random(hash(serp.id))
        self.css_props = CssProps(html)

        if u'Показана упрощенная версия'.encode('utf8') in html:
            raise Exception('got a simplified serp')
        # NOTE(review): find() yields None when the main div is missing, which
        # surfaces as an AttributeError inside find_serp_elements.
        main_div = soup.find(id=self.settings.MAIN_DIV_ID)
        serp_element_list = self.find_serp_elements(main_div)
        for serp_element in serp_element_list:
            assert isinstance(serp_element, SerpElement)
            serp_element.skeleton = self.skeleton_evaluator.eval_skeleton(serp_element.dom_element, self.css_props)
            serp_element.skeleton_md5 = self.eval_skeleton_md5(serp_element.skeleton)
            serp_element.short_skeleton = self.short_skeleton_evaluator.eval_skeleton(serp_element.dom_element, self.css_props)
            # Drop the DOM reference once the skeletons are computed.
            serp_element.dom_element = None

        # prettify(encoding=...) returns bytes, so the doctype fix-up uses
        # bytes literals as well.
        serp.html_with_seanid = soup.prettify(encoding='utf8')
        serp.html_with_seanid = serp.html_with_seanid.replace(b'<!DOCTYPE doctype html>', b'<!doctype html>')
        serp.serp_elem_list = serp_element_list
        return serp

    def _find_element_by_seanid_recursive(self, root, seanid):
        """Depth-first search below ``root`` for the tag with the given ``seanid``.

        :return: the matching tag, or ``None``.
        """
        for child in root.children:
            if not isinstance(child, Tag):
                continue
            if child.attrs.get('seanid') == seanid:
                return child
            find_recursive = self._find_element_by_seanid_recursive(child, seanid)
            if find_recursive:
                return find_recursive
        return None

    def find_element_by_seanid(self, html, seanid):
        """Parse ``html`` and return the tag carrying ``seanid``, or ``None``."""
        soup = BeautifulSoup(html, 'html.parser', from_encoding="utf8")
        return self._find_element_by_seanid_recursive(soup, seanid)

    def find_serp_elements(self, root):
        """Collect the SERP elements below ``root``.

        Children for which ``is_skip_down`` holds are descended into (tags
        only); every other non-empty child becomes a ``SerpElement`` and is
        marked in the DOM with ``sean="1"`` and a random 8-character
        ``seanid``.

        :return: list of ``SerpElement`` in document order.
        """
        ret = []
        alphabet = string.ascii_letters + string.digits
        for child in root.children:
            if self.is_skip_down(child):
                if isinstance(child, Tag):
                    ret.extend(self.find_serp_elements(child))
            elif not self.is_empty_element(child):
                seanid = ''.join(self.rand.choice(alphabet) for _ in range(8))
                child.attrs['sean'] = '1'
                child.attrs['seanid'] = seanid
                ret.append(SerpElement(
                    dom_element=child,
                    seanid=seanid,
                    query_text=self.current_serp.query_text,
                    serp_id=self.current_serp.id
                ))
        return ret

    def is_skip_down(self, element):
        """Tell whether ``element`` is a mere wrapper that must be descended into.

        :return: ``True`` when the element itself is not a SERP element and
            its children should be examined instead.
        """
        if element.name not in self.settings.TAG_NAMES_NOT_TO_SKIP_DOWN:
            return True

        element_id = element.attrs.get("id")
        if element_id and '#' + element_id in self.settings.SELECTORS_TO_SKIP_DOWN:
            return True
        if element_id:
            # Ids matching a skip regex do not count as identifying the element.
            for regex in self.settings.SKELETON_SKIP_SELECTORS_REGEX:
                if re.match(regex, '#' + element_id):
                    element_id = None
                    break

        # A real list (not a lazy filter object) so that the emptiness test
        # further down is reliable.
        css_class_list = [c for c in element.attrs.get("class", []) if self.settings.skeleton_accepts_class(c)]
        for css_class in css_class_list:
            if not css_class:
                continue
            if '.' + css_class in self.settings.SELECTORS_TO_SKIP_DOWN:
                return True

        if self.settings.SKELETON_SKIP_SELECTORS_NOBORDER:
            children = [c for c in element.children if isinstance(c, Tag)]
            # A heading followed by a rounded-border box is kept as one block.
            if len(children) == 2 and children[0].attrs.get("role") == "heading" and self.css_props.is_element_with_round_border(children[1]):
                element.attrs["sean_block_with_heading"] = "1"
                children[1].attrs["sean_has_border"] = "1"
                return False

            return not self.css_props.is_element_with_round_border(element)
        else:
            # Without an id or an accepted class nothing identifies the element.
            if not element_id and not css_class_list:
                return True

        return False

    def is_empty_element(self, element):
        """Tell whether ``element`` has no text besides whitespace and HTML comments."""
        inner_text = element.get_text()
        if not inner_text:
            return True
        inner_text = re.sub(r'(?s)<!--.*?--!?>', '', inner_text)
        if not inner_text.strip():
            return True
        return False

    def html2serp_id(self, html):
        """Return the page id: the first 20 characters of the url-safe base64 md5 of ``html``.

        :param html: page markup as a byte string.
        """
        m = hashlib.md5()
        m.update(html)
        # A bytes literal for altchars is accepted by every Python version.
        digest = base64.b64encode(m.digest(), altchars=b'-_')
        digest = digest[:20]
        return digest

    @staticmethod
    def eval_skeleton_md5(skeleton):
        """Return the base64-encoded md5 digest of ``skeleton``.

        :param skeleton: text or byte string; text is encoded as utf8 first.
        """
        # type(u'') is the text type on every Python version, so the
        # Python-2-only name ``unicode`` is not needed.
        if isinstance(skeleton, type(u'')):
            skeleton = skeleton.encode('utf8')
        assert isinstance(skeleton, bytes)
        return base64.b64encode(hashlib.md5(skeleton).digest())
