import logging
import re
from urllib.parse import urljoin, urlparse

from django.conf import settings

from intranet.search.core.sources.utils import fix_encoding
from intranet.search.core.swarm import Indexer
from intranet.search.core.utils import http, reraise_as_recoverable
from intranet.search.core.utils.cache import Urls
from intranet.search.core.utils.xml import parse_html


# Module-level logger, named after this module so records can be filtered per module.
log = logging.getLogger(__name__)


class RobotIndexer(Indexer):
    """Base HTTP crawler: walks catalog pages and fetches document pages.

    Subclasses must implement ``is_document_url`` and ``is_follow_url``.
    Pattern lists consulted through ``check`` (for example ``catalog``) are
    expected to be provided by subclasses as compiled regular expressions.
    """

    http_timeout = 30            # passed as ``timeout`` to every HTTP GET
    http_retries = 1             # passed as ``_retries`` to http.call_with_retry
    allowed_statuses = [200]     # statuses accepted as a normal page
    deleted_statuses = [404]     # statuses that trigger deletion of the document
    urls = None
    allow_redirects = True

    def __init__(self, options):
        """Read the API settings for this source/index and create the HTTP session."""
        super().__init__(options)

        self.api = settings.ISEARCH['api'][self.source][self.index]
        self.host = self.api.base_url()
        self.path = self.api['path']
        self.headers = self.api.headers()

        # NOTE(review): verify=False presumably disables TLS certificate
        # verification for the session -- confirm this is intentional.
        self.session = http.create_session(max_retries=3, verify=False, headers=self.headers)

    def compile(self, keys, flags=0):
        """Compile every regex source in ``keys`` with ``flags``; return a list."""
        return [re.compile(p, flags) for p in keys]

    def check(self, url, _type, default):
        """Tell whether the path of ``url`` matches any pattern in ``self.<_type>``.

        :param url: url whose path component is tested.
        :param _type: name of the attribute holding compiled patterns.
        :param default: result to use when that attribute is empty or missing.
        :return: bool.
        """
        path = urlparse(url).path
        # A missing attribute is treated like an empty pattern list so that
        # ``default`` applies instead of an AttributeError.
        patterns = getattr(self, _type, None)
        if not patterns:
            return bool(default)

        return any(pattern.search(path) for pattern in patterns)

    def is_catalog_url(self, url):
        """Return True if ``url`` points to the next catalog page, else False."""
        return self.check(url, 'catalog', False)

    def is_document_url(self, url):
        """Return True if ``url`` must be downloaded and processed as a document."""
        raise NotImplementedError

    def get_catalog_url(self, url):
        """Return the url to request for a catalog page (unchanged by default)."""
        return url

    def normalize_url(self, url, base_url=None):
        """Resolve ``url`` against ``base_url`` (``self.host`` by default) and normalize it.

        :return: the normalized url, or None if it cannot be joined or normalized.
        """
        base_url = base_url or self.host
        try:
            # urljoin itself raises ValueError on some malformed hrefs, so it
            # is kept inside the try to honour the "None on failure" contract.
            return http.normalize_url(urljoin(base_url, url))
        except Exception:
            return None

    def follow_urls(self, parsed_page, cur_url):
        """Hand links found on ``parsed_page`` over to the 'walk' or 'fetch' step.

        Anchors whose ``rel`` contains "noopener" or "nofollow" are ignored.
        """
        a_filter = '//a[not(contains(@rel, "noopener") or contains(@rel, "nofollow"))]/@href'
        for href in parsed_page.xpath(a_filter):
            url = self.normalize_url(href, cur_url)
            if url is None:
                # Never pass an unparsable link further down the pipeline.
                log.warning("Can't parse url %s. Ignored.", href)
                continue

            if not self.is_follow_url(cur_url, url):
                continue

            if self.is_catalog_url(url):
                self.next('walk', url=url)
            else:
                self.next('fetch', url=url)

    def is_follow_url(self, cur_url, url):
        """Return True if the link ``url`` found on ``cur_url`` should be followed."""
        raise NotImplementedError

    def fetch_page(self, url):
        """GET ``url``, accepting both allowed and deleted statuses.

        Any exception is logged together with the url and re-raised.
        """
        try:
            return http.call_with_retry(
                self.session.get, url,
                _ok_statuses=self.allowed_statuses + self.deleted_statuses,
                _retries=self.http_retries,
                timeout=self.http_timeout,
                allow_redirects=self.allow_redirects
            )
        except Exception as e:
            log.exception('%s, url: %s', e, url)
            raise

    @reraise_as_recoverable(*http.ERRORS)
    def do_fetch(self, url, **kwargs):
        """Download ``url`` and pass the page on to 'create', or delete the document.

        Extra keyword arguments are forwarded to the 'create' step, merged over
        whatever ``extra_fetch`` returns.
        """
        log.info('Start fetching url: %s', url)
        normalized_url = self.normalize_url(url)
        if normalized_url is None:
            # Log the original value; the normalized one is just None.
            log.warning('Cannot parse url %s. Ignored.', url)
            return
        url = normalized_url

        page = self.fetch_page(url)

        if page.status_code in self.deleted_statuses:
            log.info('Delete page %s because of %s status code', url, page.status_code)
            self.do_delete(url)
            return

        content_type = page.headers.get('content-type')
        # A response without a Content-Type header is treated as non-HTML.
        if 'text/html' not in (content_type or ''):
            log.info('Skip page %s because of %s content type', url, content_type)
            return

        extra = self.extra_fetch(url)
        extra.update(**kwargs)
        self.next('create', page=page, **extra)
        if self.allow_redirects and page.url != url:
            # Delete the page that redirected to another one.
            log.info('Delete page %s because of redirect to %s', url, page.url)
            self.do_delete(url)

    def extra_fetch(self, url):
        """Return extra keyword arguments for the 'create' step (none by default)."""
        return {}

    def get_walk_urls(self, page):
        """Return the sorted, de-duplicated normalized hrefs of all anchors on ``page``.

        Hrefs that cannot be normalized are dropped; a None mixed with strings
        would make ``sorted`` raise TypeError.
        """
        normalized = {self.normalize_url(href) for href in page.xpath('//a/@href')}
        normalized.discard(None)
        return sorted(normalized)

    @reraise_as_recoverable(*http.ERRORS)
    def do_walk(self, url=None, **kwargs):
        """Download a catalog page and dispatch its links to 'walk' or 'fetch'."""
        full_url = self.get_catalog_url(url)
        raw_index_page = http.call_with_retry(
            self.session.get,
            full_url,
            _retries=self.http_retries,
            timeout=self.http_timeout,
            allow_redirects=self.allow_redirects,
        )

        index_page = parse_html(raw_index_page.content)

        for orig_url in self.get_walk_urls(index_page):
            wurl = self.normalize_url(orig_url, full_url)
            if wurl is None:
                log.warning("Can't parse url %s. Ignored.", orig_url)
                continue

            if not self.is_follow_url(url, wurl):
                continue

            if self.is_catalog_url(wurl):
                self.next('walk', url=wurl)
            elif self.is_document_url(wurl):
                self.next('fetch', url=wurl)

    def do_delete(self, url):
        """Pass a deletion for ``url`` to the 'content' and 'store' steps."""
        doc = self.create_document(url)
        self.next('content', url=url, delete=True)
        self.next('store', document=doc, delete=True)


class ExtendedRobotIndexer(RobotIndexer):
    """Crawl the catalog automatically, starting from ``self.path`` on ``self.host``.

    The indexer marks the pages it has already visited by itself.

    At minimum only ``get_doc`` has to be overridden; it receives the parsed
    HTML and the url of the document.
    """
    allow = ()                  # regexes that allow following a link, or
    disallow = ()               # regexes that forbid following a link
    catalog = ()                # regexes matching catalog pages
    document = ()               # regexes matching document pages

    allowed_statuses = [200, 404]

    def __init__(self, options):
        """Compile the class-level pattern lists and create the visited-url registry."""
        super().__init__(options)

        # Replace the raw pattern sources with case-insensitive compiled regexes.
        for name in ('allow', 'disallow', 'catalog', 'document'):
            setattr(self, name,
                    self.compile(getattr(self, name, ()), re.IGNORECASE))

        self.urls = Urls(self.cache_storage, name='%s_urls' % self.index)

    def is_follow_url(self, cur_url, url):
        """Decide whether the link ``url`` found on ``cur_url`` should be followed.

        The link is followed only if it can be normalized, is http(s), passes
        ``self.urls.visit_url``, does not match ``disallow``, is on the same
        netloc as ``self.host`` and, when ``allow`` patterns exist, matches one.
        """
        url = self.normalize_url(url, cur_url)
        if url is None:
            return False

        parsed_url = urlparse(url)

        if parsed_url.scheme not in ('http', 'https'):
            return False

        # presumably visit_url returns False for an already visited url --
        # verify against the Urls implementation
        if not self.urls.visit_url(url):
            return False

        # If a disallow pattern matches, we do not go there.
        if self.check(url, 'disallow', False):
            return False

        if parsed_url.netloc != urlparse(self.host).netloc:
            return False

        if self.allow:
            return self.check(url, 'allow', False)

        return True

    def get_catalog_url(self, url):
        """Return the normalized catalog url, defaulting to ``self.path``."""
        if url is None:
            url = self.path

        return self.normalize_url(url)

    def do_walk(self, url=None, **kwargs):
        """Register the catalog url as visited, then walk it.

        ``**kwargs`` is accepted and forwarded for parity with the base class
        signature.
        """
        url = self.get_catalog_url(url)
        self.urls.visit_url(url)
        return super().do_walk(url, **kwargs)

    def do_fetch(self, url, **kwargs):
        """Register the url as visited, then fetch it.

        Extra keyword arguments are forwarded to the base implementation.
        """
        normalized_url = self.normalize_url(url)
        if normalized_url is None:
            # Passing None on would make the base class resolve it to the
            # host url and fetch the wrong page.
            log.warning('Cannot parse url %s. Ignored.', url)
            return

        self.urls.visit_url(normalized_url)
        super().do_fetch(normalized_url, **kwargs)

    def is_document_url(self, url):
        """Return True if ``url`` matches ``document``, or if no such patterns exist."""
        return self.check(url, 'document', True)

    def do_create(self, page, **kwargs):
        """Build and store a document from a fetched page, then follow its links.

        Pages with a 404 status or a non-document url are ignored.
        """
        if page.status_code == 404:
            return

        # Cheap url check first, so pages that will be discarded are not parsed.
        if not self.is_document_url(page.url):
            return

        parsed = parse_html(page.content)

        # Encoding workaround: keep the originally parsed tree if it fails.
        try:
            parsed = fix_encoding(page.content, parsed)
        except UnicodeEncodeError:
            log.warning('Cannot fix encoding of page %s. Using it as parsed.', page.url)

        doc = self.get_doc(parsed, page.url)

        if doc is not None:
            self.next('store', document=doc)

        self.follow_urls(parsed, page.url)

    def get_doc(self, page, url):
        """Build a document from the parsed HTML ``page`` located at ``url``.

        Must be implemented by subclasses; a None result is not stored.
        """
        raise NotImplementedError()
