# -*- coding: utf-8 -*-

import logging
import os.path
import re
import urllib2
import tarfile
import gzip
import unicodedata
import HTMLParser

from urlparse import urlparse, parse_qs

from sandbox import common, sdk2
from sandbox.projects import resource_types
from sandbox.projects.common.pumpkin import utils as pumpkin_utils
from sandbox.projects.common.pumpkin.serp_task import RU_DOMAIN, TR_DOMAIN


# (temporary) Don't require links to end strictly to avoid https://st.yandex-team.ru/SEARCH-11496#61700e4e3240775b92a49613
#
# Each checklist maps a search query to a regular expression; the patterns are applied with
# re.search() to the links extracted from the SERP collected for that query.
# Dots inside host names are escaped so that they match a literal '.' only.
RU_CHECKLIST = {
    "вк": r"^https?://vk\.com/",
    "одноклассники": r"^https?://ok\.ru/",
    "ютуб": r"^https?://www\.youtube\.com/",
    "вконтакте": r"^https?://vk\.com/",
    "mail": r"^https?://mail\.ru/",
    "алиэкспресс": r"^https?://aliexpress\.ru/",
    # (banned) "фейсбук": r"^https?://(www|ru-ru).facebook.com/",
    "фейсбук": r"^https?://ru\.wikipedia\.org/wiki/facebook",
    # The 'www.' prefix is optional as a whole (prefix together with its dot).
    "google": r"^https?://(www\.)?google\.ru/",
    "яндекс почта": r"^https?://mail\.yandex\.ru/",
    "тнт онлайн": r"^https?://tnt-online\.ru/",
}

TR_CHECKLIST = {
    "google": r"^https?://www\.google\.com\.tr/",
}


class WebTestPumpkinSerp(sdk2.Task):
    """
    Task that checks a collected SERP archive against a per-domain checklist of well-known queries.

    For every query of the checklist the fetch URL is looked up in the URLs resource, and the SERP
    stored for that URL must contain a link matching the checklist pattern.
    """

    class Parameters(sdk2.Task.Parameters):
        serp_resource_id = sdk2.parameters.LastReleasedResource("Serp archive", resource_type=resource_types.SERP_COLLECTOR_LOOT)
        urls_resource_id = sdk2.parameters.LastReleasedResource("Set of urls to retrieve", resource_type=resource_types.SERP_COLLECTOR_URLS)
        domain = sdk2.parameters.String("Domain", default=RU_DOMAIN, required=True)

    def on_execute(self):
        """Resolve local paths of both input resources and run the checklist verification."""
        urls_path = str(sdk2.ResourceData(self.Parameters.urls_resource_id).path)
        serps_path = str(sdk2.ResourceData(self.Parameters.serp_resource_id).path)
        self._check_known_serps(urls_path, serps_path)

    def _check_known_serps(self, urls_path, serps_path):
        """
        Verify that SERPs for all checklist queries of the configured domain contain the expected links.

        :param urls_path: path to the tar archive with the URLs that were fetched.
        :param serps_path: path to the tar archive with the collected SERP pages.
        :raise common.errors.TaskFailure: when the domain has no checklist, when some checklist query
            has no fetch URL, or when a SERP lacks the expected link.
        """
        domain = self.Parameters.domain
        if domain == RU_DOMAIN:
            checklist = RU_CHECKLIST
        elif domain == TR_DOMAIN:
            checklist = TR_CHECKLIST
        else:
            # Without this branch 'checklist' stays unbound and the task dies with UnboundLocalError.
            logging.error("No checklist for domain %r", domain)
            raise common.errors.TaskFailure("Unsupported domain: %r" % (domain,))

        fetch_urls = _find_serp_urls_by_queries(checklist.keys(), urls_path)
        if len(fetch_urls) != len(checklist):
            logging.error("Some urls not found: %r, expected: %r", fetch_urls.keys(), checklist.keys())
            raise common.errors.TaskFailure("Incomplete check urls list")

        _check_serp_snippets_patterns(fetch_urls, checklist, serps_path)


def _check_serp_snippets_patterns(fetch_urls, checklist, serps_path):
    """
    Check that every stored SERP of a checklist query contains a link matching the query pattern.

    :param fetch_urls: dict mapping a query to the URL its SERP was fetched from.
    :param checklist: dict mapping a query to the regex one of the SERP links must match.
    :param serps_path: path to the tar archive with gzipped SERP pages.
    :raise common.errors.TaskFailure: when a SERP found in the archive has no link matching the pattern.
    """
    # Archive member name -> query whose SERP is expected to be stored under that name.
    files = {}
    for query, url in fetch_urls.iteritems():
        path, _ = _get_index_path(url)
        files[path] = query

    found_files = set()
    with tarfile.open(serps_path) as tar:
        for item in tar:
            if item.name not in files:
                continue
            query = files[item.name]
            pattern = checklist[query]
            page_file = tar.extractfile(item)
            with gzip.GzipFile(item.name, "rb", 9, page_file) as page_gz:
                serp_urls = _parse_serp_urls(page_gz.read())

            if not any(re.search(pattern, serp_url) for serp_url in serp_urls):
                logging.error("Control url pattern %s not found in serp for query %s in %s",
                              pattern, query, serp_urls)
                raise common.errors.TaskFailure("Control url not found")

            logging.info("Found pattern %s in serp with urls %s for query %s",
                         pattern, serp_urls, query)
            found_files.add(item.name)

    # Expected pages absent from the archive used to be skipped without any trace.
    # NOTE(review): this is only reported, not treated as a failure — confirm whether it should fail the task.
    missing_files = sorted(set(files) - found_files)
    if missing_files:
        logging.warning("Serp pages not found in archive %s: %s (queries: %s)",
                        serps_path, missing_files, [files[name] for name in missing_files])


def _parse_serp_urls(data):
    """
    Extract organic result links from the HTML of a SERP.

    Only '<a class="...">' tags whose class list contains one of the organic link classes
    (preceded by whitespace) are taken into account. Each 'href' gets the '//h.yandex.net/?'
    substring removed, is URL-unquoted, stripped and lower-cased, and is finally passed
    through _original_page_url().

    :param data: SERP page HTML.
    :return: result of mapping _original_page_url() over the cleaned links.
    """
    link_pattern = re.compile(
        r'''(?:
                <a\s+class=" 				# Search for HTML link tag
                (?:
                    [^"]*?       			# With any list of classes,
                    \s(?:serp-item__title-link|organic__url|organic__greenurl)  # organic url classes
                )
                (?:
                    [^"]*?"					# and any list of classes followed.
                )
                [^>]*?						# Then skip all attributes before 'href',
            )
            href="(
                [^"]+						# and get link from href.
            )"
            [^>]*?							# Skip all text to the end of tag
            >(
                .*?							# and get the tag body including all nested tags. NOTE: USE ONLY LAZY REGEX HERE
            )
            </a>							# stop at close tag.
        ''', re.VERBOSE | re.MULTILINE)

    cleaned_urls = []
    # Every match is a (href, tag body) pair; the tag body is not used.
    for href, _body in link_pattern.findall(data):
        without_redirect = href.replace('//h.yandex.net/?', '')
        unquoted = urllib2.unquote(without_redirect)
        cleaned_urls.append(unquoted.strip().lower())

    return map(_original_page_url, cleaned_urls)


def _get_index_path(url):
    """
    Build the archive path of the page stored for 'url'.

    The path is './<c0>/<c1>/<c2>/<hashname>.html.gz', where c0..c2 are the first three
    characters of the hash name returned by pumpkin_utils.make_urls().

    :param url: URL passed to pumpkin_utils.make_urls().
    :return: tuple (archive path, index URL returned by make_urls).
    """
    index_url, _, hashname = pumpkin_utils.make_urls(url)
    file_name = hashname + '.html.gz'
    # The first three characters are indexed one by one, so a shorter hash name still raises IndexError.
    path_parts = (".", hashname[0], hashname[1], hashname[2], file_name)
    archive_path = os.path.join(*path_parts)
    return (archive_path, index_url)


def _find_serp_urls_by_queries(queries, urls_path):
    """
    Find fetch URLs of the given queries inside the URLs archive.

    Every member of the tar archive that can be read as a file is scanned line by line; a line
    matches a query when the first value of its 'text' query-string parameter equals the query.
    Lines are stored exactly as read (including any line terminator), and a later matching line
    replaces an earlier one for the same query.

    :param queries: queries to look for.
    :param urls_path: path to the tar archive with URL lists.
    :return: dict mapping each query that was found to its URL line.
    """
    matched = {}
    with tarfile.open(urls_path) as archive:
        for member in archive:
            lines = archive.extractfile(member)
            if lines is None:
                # No file object is available for this member, so there is nothing to read.
                continue
            for line in lines:
                text_values = parse_qs(urlparse(line).query).get('text')
                if text_values:
                    wanted = text_values[0]
                    matched.update((query, line) for query in queries if query == wanted)
    return matched


def _original_page_url(url):
    """
    Parse 'saved page copy' URL and extract original page URL from parameters.
    When given URL seems not to be a 'saved copy' URL just return the URL with no parsing at all.
    The URL is also returned untouched when it is not valid UTF-8 or when a 'saved copy' URL
    carries no 'url' parameter.

    :param url: 'saved copy' page URL.
    :return: original page link (reduced to ASCII) when 'url' is a Yandex 'saved copy' URL,
        otherwise 'url' itself.
    """

    # Some URLs can contain 'incorrect' urlencoded string which results into illegal characters after
    # URL decoding (unquoting). So, trying to decode such URL string as UTF-8 raises an UnicodeDecodeError.
    try:
        # Original 'saved copy' URL may contain HTML '&amp;' instead of '&' symbols, so replace them:
        html_replaced = HTMLParser.HTMLParser().unescape(url.decode('utf-8'))
    except UnicodeDecodeError:
        return url

    parsed_url = urlparse(html_replaced)
    if parsed_url.netloc != 'hghltd.yandex.net' or parsed_url.path != '/yandbtm':
        return url

    # A 'saved copy' link without the 'url' parameter used to raise KeyError here and abort the
    # whole check; there is nothing to extract from it, so keep the link as it is.
    original_urls = parse_qs(parsed_url.query).get('url')
    if not original_urls:
        return url

    # Decompose characters (NFKD) and drop everything that has no ASCII representation.
    return unicodedata.normalize('NFKD', original_urls[0]).encode('ascii', 'ignore')
