import dateutil
import logging
import re
from html import unescape
from urllib.parse import urljoin, urlparse

from lxml import etree
from django.conf import settings

from intranet.search.core.sources.wiki.utils import prepare_metrix
from intranet.search.core.sources.utils import (
    find_one,
    get_breadcrumbs,
    date_as_factor,
    Services,
    get_elements_content,
    get_text_content,
    get_popularity_factors,
    get_by_lang,
)
from intranet.search.core.swarm import RobotIndexer
from intranet.search.core.sources.doc import utils
from intranet.search.core.utils import http, reraise_as_recoverable
from intranet.search.core.utils.cache import Urls
from intranet.search.core.utils.xml import parse_html

log = logging.getLogger(__name__)


def parse_date(date_str):
    """Parse a textual date, ignoring any parenthesised fragment.

    Everything from the first ``(`` to the last ``)`` is removed before
    parsing (e.g. a trailing ``(MSK)``-style annotation).

    :param date_str: date text; ``None`` and blank strings are accepted.
    :return: the result of ``dateutil.parser.parse`` for the cleaned text,
        or ``None`` when there is nothing left to parse.
    """
    if not date_str:
        return None

    cleaned = re.sub(r'\(.+\)', '', date_str).strip()
    if not cleaned:
        return None

    # A bare ``import dateutil`` does not guarantee that the ``parser``
    # submodule is loaded, so import it explicitly right where it is used.
    from dateutil import parser as date_parser

    return date_parser.parse(cleaned)


class Source(RobotIndexer):
    """Indexer for the ``doc`` source: walks a sitemap and indexes the pages it lists."""

    # Used as the ``type`` facet value in ``do_create`` and to name the URL cache in ``__init__``.
    index = 'doc'
    source = 'doc'
    # Language codes; ``do_fetch`` requests every page once per entry via ``?lang=<code>``.
    languages = ('ru', 'en')

    # URL patterns, compiled in ``__init__``: the sitemap itself ...
    catalog = [r'^/sitemap\.xml']
    # ... and anything that does not start with ``/sitemap.xml``.
    document = [r'^(?!/sitemap\.xml).*']

    # Labels for the ``catalog`` facet emitted in ``do_create``, keyed by the page's ``doc_type``.
    facets = {
        'doc': {'en': 'Internal', 'ru': 'Внутренний'},
        'archive': {'en': 'Archive', 'ru': 'Архив'},
        'external': {'en': 'External', 'ru': 'Внешний'},
    }

    # Passed as ``allow_redirects`` to the sitemap request in ``do_walk``.
    allow_redirects = False
    # NOTE(review): not referenced inside this class; presumably consumed by RobotIndexer — confirm.
    allowed_statuses = [200, 404]

    def __init__(self, options):
        """Initialise the indexer: compile URL patterns and create cache helpers.

        :param options: passed through unchanged to the parent constructor.
        """
        super().__init__(options)

        # Replace the class-level pattern lists with whatever ``self.compile`` returns for them.
        self.catalog = self.compile(self.catalog)
        self.document = self.compile(self.document)

        # Cache-backed helpers: service records keyed by slug, and a per-index URL registry.
        self.services = Services(
            self.cache_storage,
            key_field='slug',
            fields=('name', 'slug'),
        )
        self.urls = Urls(self.cache_storage, name=f'{self.index}_urls')

    def prepare_caches(self):
        """Store page-view counters in the ``doc`` profile storage.

        For every ``key -> value`` pair returned by ``prepare_metrix('doc')``
        the profile ``key`` is updated with ``{'page_views': value}``.

        Best effort: any error is logged with its traceback and swallowed,
        so a metrics failure does not abort the caller.
        """
        try:
            for key, value in prepare_metrix('doc').items():
                self.profile_storage.update('doc', key, {'page_views': value})
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
            # and SystemExit are not swallowed along with real errors.
            log.exception('Doc profiles error: METRIX')

    def do_setup(self, **kwargs):
        self.prepare_caches()

        if self.options['keys']:
            for url in self.options['keys']:
                if self.is_catalog_url(url):
                    self.next('walk', url=url)
                elif self.is_document_url(url):
                    self.next('fetch', url=url)
        else:
            self.next('walk', url=self.api.url())

    def get_walk_urls_and_access(self, page):
        for node in page.xpath('//url'):
            loc = node.find('loc')
            access = node.find('access')

            if loc is not None:
                url = loc.text
            else:
                log.warning('No child loc. Can not get url')
                continue

            if access is not None:
                # Логины склеены через запятую
                acl_users_whitelist = access.text.split(',')
            else:
                acl_users_whitelist = []

            yield url, acl_users_whitelist

    @reraise_as_recoverable(*http.ERRORS)
    def do_walk(self, url, **kwargs):
        """Download the sitemap at *url* and queue a ``fetch`` step for each document URL in it.

        The access whitelist found next to every URL is forwarded to the
        ``fetch`` step as ``acl_users_whitelist``.
        """
        sitemap_response = http.call_with_retry(
            self.session.get,
            url,
            _retries=self.http_retries,
            timeout=self.http_timeout,
            allow_redirects=self.allow_redirects,
        )
        sitemap = parse_html(sitemap_response.content)

        for page_url, acl_users_whitelist in self.get_walk_urls_and_access(sitemap):
            if not self.is_document_url(page_url):
                continue
            self.next('fetch', url=page_url, acl_users_whitelist=acl_users_whitelist)

    def is_document_url(self, url):
        return self.check(url, 'document', False)

    def is_follow_url(self, cur_url, url):
        """Always return ``True``; both arguments are ignored."""
        return True

    @reraise_as_recoverable(*http.ERRORS)
    def do_fetch(self, url, **kwargs):
        """Fetch every language version of *url* and queue a ``create`` step.

        The URL is first normalised with ``http.cut_query``. Nothing happens
        if ``self.urls.visit_url`` reports it was already handled. If any
        language version answers with a status from ``self.deleted_statuses``
        the document is deleted instead and no ``create`` step is queued.

        :param url: page URL.
        :param kwargs: extra values forwarded to the ``create`` step; they
            override same-named keys returned by ``extra_fetch``.
        """
        log.info('Start fetching url: %s', url)
        url = http.cut_query(url)

        first_visit = self.urls.visit_url(url)
        if not first_visit:
            log.info('Already indexed %s', url)
            return

        pages = {}
        for lang in self.languages:
            localized_page = self.fetch_page(f'{url}?lang={lang}')
            status = localized_page.status_code

            if status in self.deleted_statuses:
                log.info('Delete page %s because of %s status code', url, status)
                self.do_delete(url)
                return

            pages[lang] = localized_page

        create_kwargs = self.extra_fetch(url)
        create_kwargs.update(**kwargs)
        self.next('create', url=url, pages=pages, **create_kwargs)

    def extra_fetch(self, url):
        """Return the additional ``create``-step keyword arguments for *url*.

        This is whatever ``get_popularity_factors(url)`` returns; ``do_fetch``
        treats the result as a mutable mapping.
        """
        popularity = get_popularity_factors(url)
        return popularity

    def parse_page(self, page):
        """Extract the indexable fields of a fetched documentation page.

        :param page: response object; only ``page.content`` is read.
        :return: dict with the page URL, title, description, the ``<main>``
            node, the raw content, the ``doc_*`` / component / topic / service
            meta values, language information, ``doc_full_name`` and
            ``breadcrumbs``. Top-level string values are HTML-unescaped.
        """
        parsed = parse_html(page.content)

        def meta(name, *fallback):
            # Content of ``<meta name="...">``; *fallback* is handed to ``find_one`` as is.
            return find_one(parsed, f'//meta[@name="{name}"]/@content', *fallback)

        language = meta('DC.Language')
        translated_from = meta('machine_translation_from')

        # What ``find_one`` returns for a missing tag is not visible here; ``or ''``
        # keeps the ``.split`` calls below from failing if that value is None.
        categories = (meta('doc_categories') or '').split('/')
        languages = set((meta('langs') or '').split())

        data = {
            'url': find_one(parsed, '//meta[@property="og:url"]/@content'),
            'title': find_one(parsed, '//title/text()'),
            'description': meta('description'),
            'main': find_one(parsed.body, '//main'),
            'raw': page.content,

            'doc_id': meta('doc_id'),
            'doc_name': meta('doc_name'),
            'doc_type': meta('doc_type', 'doc'),
            'component_name': meta('component_name'),
            'topic_name': meta('topic_name'),
            'categories': categories,
            'service_slug': meta('doc_abc_service'),

            # In the documentation the page language is 'ru' when the page is in
            # English but was machine-translated, so report 'en' in that case.
            'language': 'en' if translated_from == 'ru' else language,
            'languages': languages,
            'translated_from': translated_from,
        }
        data['doc_full_name'] = '. '.join(filter(None, (data['component_name'], data['doc_name'])))
        data['breadcrumbs'] = self.get_breadcrumbs(parsed, data['doc_full_name'], data['topic_name'])

        # Only top-level ``str`` values are unescaped; lists, sets and nodes stay untouched.
        for key, value in data.items():
            if isinstance(value, str):
                data[key] = unescape(value)

        return data

    def do_create(self, url, pages, **kwargs):
        """Build one search document for *url* from its language versions and queue it for storing.

        :param url: page URL; used to create the document and as the ``base_url`` group attribute.
        :param pages: mapping ``language code -> fetched page`` with an entry for
            every language in ``self.languages`` that the loop reaches.
        :param kwargs: optional ``acl_users_whitelist`` (list of logins),
            ``links`` (mapping with ``urls`` / ``texts``) and ``search_queries``
            (mapping merged into the hidden zone).

        Returns early, without queuing anything, when a page cannot be parsed
        or when no language version produced usable data.
        """
        doc = self.create_document(url)

        # An empty whitelist means the document is public.
        acl_users_whitelist = kwargs.get('acl_users_whitelist', [])
        public = not acl_users_whitelist

        doc.emit_facet_attr('type', self.index, label=self.index)
        doc.emit_search_attr('s_type', 'doc')
        doc.emit_search_attr('is_empty', 0)
        doc.emit_search_attr('i_is_part', 0)
        doc.emit_search_attr('s_doc_lang', 'all')
        doc.emit_search_attr('public', str(int(public)))
        for login in acl_users_whitelist:
            doc.emit_search_attr('acl_users_whitelist', login)

        doc.emit_meta_factor('isDoc', 1)
        doc.emit_meta_factor('isExternal', 0)
        doc.emit_meta_factor('modifiedAt', date_as_factor(doc.updated))
        # NOTE(review): createdAt is fed from ``doc.updated`` too — confirm this is intended.
        doc.emit_meta_factor('createdAt', date_as_factor(doc.updated))

        doc.emit_group_attr('base_url', url)

        # Page-view profiles are looked up by the lower-cased URL path.
        url_path = urlparse(url).path
        # NOTE(review): assumes ``profile_storage.get`` never returns None — confirm.
        profile = self.profile_storage.get('doc', url_path.lower())
        doc.emit_meta_factor('pageViews', profile.get('page_views', 0))

        # Accumulators filled across all language versions of the page.
        primary_data = None
        product_labels = {}
        content = []
        title = []
        subtitle = []
        minortitle = []
        parents = []

        for language in self.languages:
            page = pages[language]

            try:
                parsed = self.parse_page(page)
            except etree.XMLSyntaxError as error:
                log.error('%s\nCannot parse html: %s', url, error)
                return

            # NOTE(review): if ``main`` is an lxml element, its truthiness reflects whether
            # it has child elements, not whether it exists — confirm this is the intended check.
            if not parsed['main']:
                log.info('Empty page %s', parsed['url'])
                continue

            # The first language version with content supplies the document-level data.
            if not primary_data:
                primary_data = parsed

            doc.emit_suggest_attr_by_parts(parsed['title'])
            product_labels[language] = parsed['doc_full_name']
            content.append(get_text_content(parsed['main']))
            title.append(parsed['title'])
            title += get_elements_content(parsed['main'], ['h1'])
            subtitle += get_elements_content(parsed['main'], ['h2', 'h3'])
            minortitle += get_elements_content(parsed['main'], ['h4', 'h5', 'h6'])
            parents += [' '.join(b.values()) for b in parsed['breadcrumbs']]

            # Snippet title fallback chain: page title, then topic + document name, then URL.
            if parsed['title']:
                snippet_title = parsed['title']
            elif parsed['topic_name']:
                snippet_title = ' - '.join([parsed['topic_name'], parsed['doc_full_name']])
            else:
                snippet_title = parsed['url']

            snippet = {
                'url': parsed['url'],
                'title': snippet_title,
                'product': parsed['component_name'],
                'public': public,
                'description': parsed['description'],
                'modtime': doc.updated,
                'breadcrumbs': parsed['breadcrumbs'],
                'is_archive': (parsed['doc_type'] == 'archive'),
                'is_external': False,
                'document': parsed['doc_id'],
                'document_name': parsed['doc_full_name'],

                'is_translated': language != settings.DEFAULT_LANGUAGE,
                'is_auto_translated': bool(parsed['translated_from']),
                'language': parsed['language'],
            }
            # Machine-translated non-default versions are stored under ``translated_<language>``.
            if language != settings.DEFAULT_LANGUAGE and parsed['translated_from']:
                snippet_language = f'translated_{language}'
            else:
                snippet_language = language
            doc.emit_snippet(snippet, snippet_language)

            if len(parsed['languages']) == 1:
                # some documents exist in one language only; do not run the loop a second time for them
                break

        if not primary_data:
            log.warning('Cannot get primary data for page %s', url)
            return

        doc.emit_facet_attr(
            'product',
            primary_data['doc_id'],
            label=get_by_lang(product_labels, 'ru'),
            label_en=get_by_lang(product_labels, 'en')
        )
        # NOTE(review): a ``doc_type`` missing from ``self.facets`` raises KeyError here.
        doc.emit_facet_attr(
            'catalog', primary_data['doc_type'],
            label=self.facets[primary_data['doc_type']]['ru'],
            label_en=self.facets[primary_data['doc_type']]['en'],
        )
        doc.emit_meta_factor('isArchive', int(primary_data['doc_type'] == 'archive'))

        # Service slug plus, when the service is found in the cache, all of its names.
        service_names = []
        if primary_data['service_slug']:
            service_names.append(primary_data['service_slug'])
            service_data = self.services.get(primary_data['service_slug'])
            if service_data:
                service_names.extend(service_data['name'].values())

        links = kwargs.get('links', {})

        body = {
            # the text of the first document goes into the main content
            'content': content[0],
            'z_ns_hidden': {
                # the rest goes into translations; it is a separate zone so that separate zone
                # factors can be computed for it and, if needed, it can be excluded from the
                # zones used to build passages
                'translated': content[1:],
                'title': title,
                'subtitle': subtitle,
                'minortitle': minortitle,
                'product': [primary_data['doc_id']] + list(product_labels.values()),
                'intlinks': [' '.join(b) for b in utils.get_links(primary_data['main'])],
                'parents': parents,
                'keywords': ', '.join(primary_data['categories']),
                'abc_service': service_names,
                'links_urls': links.get('urls', ''),
                'links_texts': links.get('texts', ''),
            },
        }
        body['z_ns_hidden'].update(kwargs.get('search_queries', {}))
        doc.emit_body(body)
        self.next('content', url=doc.url, raw_data=primary_data['raw'], updated_ts=doc.updated_ts)
        self.next('store', document=doc)

    def get_breadcrumbs(self, parsed, product_name, topic_name=None):
        """Build the breadcrumb list of a page with absolute URLs.

        :param parsed: parsed page tree.
        :param product_name: name expected for the first (product) crumb.
        :param topic_name: used as the name of the last crumb when that crumb
            has no name of its own.
        :return: list of crumb dicts (``name`` / ``url``); empty when the page
            has no ``breadcrumbs`` meta tag.
        """
        raw_breadcrumbs = find_one(parsed, '//meta[@name="breadcrumbs"]/@content')
        if not raw_breadcrumbs:
            return []

        crumbs = get_breadcrumbs(raw_breadcrumbs)
        try:
            # FIXME: workaround so that the first link points to the product.
            # Can be removed once the documentation API returns that link itself.
            if crumbs[0]['url'] == '/':
                crumbs = crumbs[1:]
            if crumbs[0]['name'] != product_name:
                first_link = find_one(parsed, '//div[contains(@class, "doc-c-menu")]/a[contains(@class, "link")]')
                product_link = {
                    'name': product_name,
                    'url': first_link.attrib['href'],
                }
                crumbs.insert(0, product_link)
        except Exception as e:
            # Best effort: keep whatever crumbs we have if the product link cannot be built.
            log.warning('Cannot generate product_link: %s', e)

        # Kept outside the ``try`` above so that a failed product-link lookup does not
        # also skip naming the last crumb. A missing ``topic_name`` never overwrites
        # the existing value, so crumb names are not replaced with None.
        if crumbs and topic_name and not crumbs[-1].get('name'):
            crumbs[-1]['name'] = topic_name

        breadcrumbs_list = []
        for crumb in crumbs:
            # Resolve relative links against the source host.
            crumb['url'] = urljoin(self.host, crumb['url'])
            breadcrumbs_list.append(crumb)

        return breadcrumbs_list
