# -*- coding: utf-8 -*-

import logging
import re

try:
    from urllib import unquote  # Python 2
except ImportError:
    from urllib.parse import unquote  # Python 3

# Public API: a star-import of this module exposes only the Parser class.
__all__ = ['Parser']


class Parser(object):
    """
        creates parser functional object
        __call__ : url, raw --> [results]

        Every result is a dict {'header': header, 'url': url}. The page
        format (raw html or json dump) is picked from the request url.
    """

    # Whole html code of a single result item.
    _RE_ITEM = re.compile(r'<li class="b-serp-item.*?/li>', re.S)
    # Title block of a result item; holds both the link and the header.
    _RE_H2 = re.compile(r'<h2.*?/h2>', re.S)
    # Value of the first href attribute, captured without the quotes.
    _RE_HREF = re.compile(r'href="(.*?)"', re.S)
    # First span of the title block; its text is the header.
    _RE_SPAN = re.compile(r'<span.*?/span>', re.S)
    # Any html tag, used to strip markup from the header.
    _RE_TAG = re.compile(r'<.*?>', re.S)
    # Request id as it appears (html-escaped) inside the page source.
    _RE_REQID = re.compile(r'reqid&quot;:&quot;(.*?)&quot;')
    # Characters removed from json titles: '[', ']' and BEL (\x07).
    _TITLE_JUNK = dict.fromkeys(map(ord, u'[]\x07'))

    def __call__(self, url, raw, get_reqid=False):
        """
            parses one search result page

            url       : request url (may be percent-encoded); selects parser
            raw       : page content
            get_reqid : when true, html pages yield (results, reqid)

            returns [results], or (results, reqid) when get_reqid is set.
            An empty raw always yields a bare [] (even with get_reqid).
            Any parsing error is logged and re-raised unchanged.
        """
        if raw == '':
            return []
        url = unquote(url)
        try:
            res = self.__parser_selector(url)(raw, get_reqid)
        except Exception as e:
            # Exception (not the Python 2 only StandardError) keeps this
            # handler valid on Python 3; the error is re-raised either way.
            logging.error('error while parsing search result page\n')
            logging.error('\t%s\n', e)
            raise
        return res

    def __parser_selector(self, url):
        """
            chooses parser from static methods of Parser class
            urls containing 'json_dump=1' go to the json parser,
            everything else to the html parser
        """
        if 'json_dump=1' in url:
            return Parser.__json_parser
        return Parser.__html_parser

    @staticmethod
    def _first_match(pattern, text, group=0):
        """
            returns requested group of the first match of compiled pattern
            in text, or '' (with a log record) when text is empty or
            nothing matches
        """
        if not text:
            logging.warning('attempted search on empty text')
            return ''
        found = pattern.search(text)
        if found is None:
            # Log the pattern source, not the repr of the compiled object.
            logging.error(
                'counldn\'t find {0} pattern in text:'.format(pattern.pattern)
            )
            logging.error(text)
            return ''
        return found.group(group)

    @staticmethod
    def __html_parser(raw, get_reqid):
        """
            raw html search result --> [{'header' : header, 'url': url}]
            as of now, registers only numbered results

            With get_reqid set, returns (results, reqid) where reqid is the
            first request id found on the page, or None (with a warning)
            when the page carries no request id at all.
        """
        # h2 code of every numbered result item
        titles = [
            Parser._first_match(Parser._RE_H2, item)
            for item in Parser._RE_ITEM.findall(raw)
            if 'b-serp-item__number' in item
        ]

        links = []
        for title in titles:
            # The capture group extracts the attribute value directly, so an
            # 'href=' substring inside the url itself is left untouched.
            url = Parser._first_match(Parser._RE_HREF, title, group=1)
            span = Parser._first_match(Parser._RE_SPAN, title)
            # strip header of html code
            header = Parser._RE_TAG.sub('', span)
            links.append({'header': unescape(header), 'url': unescape(url)})

        if not get_reqid:
            return links

        reqids_found = Parser._RE_REQID.findall(raw)
        if not reqids_found:
            # The links are still valid; do not fail with a bare IndexError.
            logging.warning('no reqid found on page')
            return (links, None)
        distinct = set(reqids_found)
        if len(distinct) > 1:
            logging.warning(
                "Multiple reqids found on page!\t\n{}\n{} returned".format(
                    '\t\n'.join(distinct), reqids_found[0]
                )
            )
        return (links, reqids_found[0])

    @staticmethod
    def __json_parser(raw, get_reqid):
        """
            json dump of search result --> [{'header': header, 'url': url}]

            raises NotImplementedError when get_reqid is set
        """
        if get_reqid:
            raise NotImplementedError("Please teach me to get reqid form json pages!")

        from json import loads
        docs = loads(raw)['tmpl_data']['searchdata']['docs']
        # wizard?
        return [
            {
                # The latin-1 -> utf-8 round trip is kept from the original
                # (presumably titles arrive as UTF-8 bytes read as latin-1;
                # TODO confirm). A mapping table is used for the deletion
                # because unicode strings do not accept the two-argument
                # translate(None, chars) form.
                'header': (
                    doc['doctitle'].encode('latin-1').decode('utf-8')
                    .translate(Parser._TITLE_JUNK)
                ),
                'url': doc['url'],
            }
            for doc in docs
        ]


def unescape(text):
    """
        converts ampersand-based characters in text into regular characters
        courtesy of Fredrik Lundh

        Handles decimal (&#65;) and hexadecimal (&#x41;) character
        references and named entities (&amp;). References that cannot be
        resolved are left in the text as they are.

        text : text string or UTF-8 encoded byte string

        Returns the native str type: UTF-8 encoded bytes on Python 2, text
        on Python 3. Input that is not a string, or a byte string that is
        not valid UTF-8, is returned unchanged.
    """
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def fixup(m):
        entity = m.group(0)
        try:
            if entity[:3] == "&#x":
                # hexadecimal character reference
                return to_char(int(entity[3:-1], 16))
            if entity[:2] == "&#":
                # decimal character reference
                return to_char(int(entity[2:-1]))
            # named entity
            return to_char(name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # unknown name, malformed number or code point out of range
            return entity  # leave as is

    # Substitute on text, never on raw bytes: mixing a non-ASCII byte string
    # with the unicode characters produced by fixup fails on Python 2.
    if isinstance(text, bytes):
        try:
            decoded = text.decode('utf-8')
        except UnicodeDecodeError:
            return text
    else:
        decoded = text

    try:
        result = re.sub(r"&#?\w+;", fixup, decoded)
    except TypeError:
        # not a string at all
        return text

    if str is bytes:
        # Python 2: callers have always received UTF-8 encoded bytes
        return result.encode('utf-8')
    return result
