import logging
from collections import defaultdict
from copy import deepcopy
from urllib.parse import urlencode, urljoin, urlparse, urlsplit, urlunsplit, parse_qs

from intranet.search.core.utils.xml import parse_html
from intranet.search.core.sources import utils
from intranet.search.core.swarm import RobotIndexer
from intranet.search.core.utils import http, reraise_as_recoverable
from intranet.search.core.utils.cache import Urls

from intranet.search.core.sources.doc.utils import get_links, get_camelcase_keywords

log = logging.getLogger(__name__)

# Hosts of the docs portal: links found on the root page ('/') of these hosts
# may be followed to other domains (see Source.is_follow_url).
BASE_DOCS_DOMAINS = {'testing.docs.yandex-team.ru', 'docs.yandex-team.ru'}
# Hosts whose links are never followed by Source.is_follow_url.
IGNORED_DOMAINS = {'passport.yandex-team.ru'}

# Known product slugs mapped to their display names per language ('ru' / 'en').
products_raw = {
    'elliptics': {'ru': 'Elliptics', 'en': 'Elliptics'},
    'eblob': {'ru': 'Eblob', 'en': 'Eblob'},
    'rift': {'ru': 'Rift', 'en': 'Rift'},
    'thevoid': {'ru': 'TheVoid', 'en': 'TheVoid'},
    'smack': {'ru': 'Smack', 'en': 'Smack'},
    'grape': {'ru': 'Grape', 'en': 'Grape'},
    'historydb': {'ru': 'HistoryDB', 'en': 'HistoryDB'},
    'react': {'ru': 'React', 'en': 'React'},
    'blueprints': {'ru': 'Blueprints', 'en': 'Blueprints'},
    'swarm': {'ru': 'Swarm', 'en': 'Swarm'},
    'clickhouse': {'ru': 'ClickHouse', 'en': 'ClickHouse'},
    'qloud': {'ru': 'Qloud', 'en': 'Qloud'},
    'nile': {'ru': 'Nile', 'en': 'Nile'},
    'infra-docs-hurge': {'ru': 'Документация инфраструктуры поисковых интерфейсов',
                         'en': 'infra-docs-hurge'},
    'yt': {'ru': 'YT', 'en': 'YT'},
    'yql': {'ru': 'YQL (Старая документация)', 'en': 'YQL (Old documentation)'},
    'logbroker': {'ru': 'Logbroker', 'en': 'Logbroker'},
}

# Lookup table used by the parsers. Indexing an unknown slug never raises: it
# inserts and returns an empty nested defaultdict, so products[slug]['ru'] is ''.
# Entries copied from products_raw stay plain dicts (missing languages raise KeyError).
products = defaultdict(lambda: defaultdict(str))
products.update(products_raw)


class Source(RobotIndexer):
    source = 'doc'
    index = 'external'

    allow_redirects = True

    def __init__(self, options):
        """Initialise the base indexer and the URL registry of this source.

        :param options: indexer options, passed to the base class unchanged.
        """
        super().__init__(options)
        # Registry kept in the cache storage under the name 'doc_urls';
        # is_follow_url() consults it through visit_url().
        self.urls = Urls(self.cache_storage, name='doc_urls')

    def do_setup(self, **kwargs):
        if self.options['keys']:
            for url in self.options['keys']:
                self.next('fetch', url=url)
        else:
            self.next('walk')

    def get_catalog_url(self, url=None):
        return url if url else self.api.url()

    def is_catalog_url(self, url):
        """Report whether *url* is a catalog page; this source always answers False."""
        return False

    def is_document_url(self, url):
        """Report whether *url* is a document page; this source always answers True."""
        # this is only called from walk, and every link there is acceptable
        return True

    def do_push(self, data, delete=False, **kwargs):
        if delete:
            self.do_delete(data['apiurl'])
        else:
            log.info('Got push with param delete=False, push data is %s', data)

    @reraise_as_recoverable(*http.ERRORS)
    def do_fetch(self, url, **kwargs):
        """Fetch a page in every language it declares and schedule its indexing.

        The page is downloaded once as is. When the URL carries no explicit
        ``lang`` query parameter, it is downloaded again with ``lang=<code>``
        for every other language listed in the page's ``langs`` meta tag.
        Each successfully fetched HTML variant is handed to the ``create`` stage.

        Processing stops as soon as a variant answers with one of
        ``self.deleted_statuses`` (the page is deleted) or with a non-HTML
        content type.

        :param url: page address; it is normalized before use.
        :param kwargs: extra arguments forwarded to the ``create`` stage.
        """
        log.info('Start fetching url: %s', url)
        raw_url = url
        url = self.normalize_url(raw_url)
        if url is None:
            # Report the address we were given: the normalized value is None here.
            log.warning('Cannot parse url %s. Ignored.', raw_url)
            return

        page_url = url
        page = self.fetch_page(page_url)
        parsed = parse_html(page.content)

        parts = list(urlsplit(page_url))
        query = parse_qs(parts[3])
        page_lang_is_original = 'lang' not in query

        page_lang = utils.find_one(parsed, '//meta[@name="lang"]/@content') or 'all'
        page_langs = utils.find_one(parsed, '//meta[@name="langs"]/@content').split()

        # The page's own language goes first: it is already downloaded.
        langs_to_proceed = [page_lang]
        if page_lang_is_original:
            langs_to_proceed += [code for code in page_langs if code != page_lang]

        for lang in langs_to_proceed:
            if lang != page_lang:
                # Re-request the same address with an explicit language.
                query.update({'lang': [lang]})
                parts[3] = urlencode(query, doseq=True)
                page_url = urlunsplit(parts)
                log.info('Fetching url: %s', page_url)
                page = self.fetch_page(page_url)

            if page.status_code in self.deleted_statuses:
                # NOTE(review): `url` is the originally requested address even when a
                # language variant was fetched - confirm this is intended.
                log.info('Delete page %s because of %s status code', url, page.status_code)
                self.do_delete(url)
                return

            # A response without a Content-Type header is treated as non-HTML
            # instead of crashing on `in None`.
            content_type = page.headers.get('content-type') or ''
            if 'text/html' not in content_type:
                log.info('Skip page %s because of %s content type', url, content_type)
                return

            extra = self.extra_fetch(page_url)
            extra.update(**kwargs)
            self.next('create', page=page, **extra)
            if self.allow_redirects and page.url != page_url:
                # delete the page that redirected to another one
                log.info('Delete page %s because of redirect to %s', page_url, page.url)
                self.do_delete(page_url)

    def extra_fetch(self, url):
        """Return extra keyword arguments for the ``create`` stage of *url*.

        Delegates to ``utils.get_popularity_factors``; do_fetch() merges its own
        kwargs into the result before scheduling ``create``.
        """
        return utils.get_popularity_factors(url)

    def check_reverbrain(self, url):
        return not url.query and not url.path.startswith('_')

    def check_github(self, url):
        parts = url.path.split('/')

        try:
            return parts[1] == 'pages'
        except IndexError:
            return False

    def check_general(self, url):
        # не индексируем английский и другие языки для кликхауса
        if 'clickhouse' in url.netloc and not url.path.startswith('/docs/ru/'):
            return False

        # вики и основной внутреннюю документацию не индексируем - для них есть свои индексаторы
        if url.netloc in ('doc.yandex-team.ru', 'wiki.yandex-team.ru'):
            return False

        return True

    def is_follow_url(self, cur_url, url):
        # Такое возможно, если пришли из каталога
        if cur_url is None:
            return True

        parsed_url = urlparse(url)

        if 'reverbrain' in parsed_url.netloc:
            check = self.check_reverbrain
        elif parsed_url.netloc == 'github.yandex-team.ru':
            check = self.check_github
        else:
            check = self.check_general

        parsed_cur = urlparse(cur_url)

        # проверяем домен: индексируем только внутри одного домена, кроме случая,
        # когда взяли ссылку с главной страницы аркадийной документации
        need_follow_domain = (
            parsed_url.netloc not in IGNORED_DOMAINS
            and (
                parsed_cur.netloc == parsed_url.netloc
                or parsed_cur.netloc in BASE_DOCS_DOMAINS and parsed_cur.path == '/'
            )
        )
        need_follow_path = True

        # В документации yql, к примеру, которая хостится на yql.yandex-team.ru есть ссылки
        # на операции в интерфейсе yql. Домен в этом случае совпадает, а вот из-под пути /docs/
        # страницы выходят. Не индексируем такие страницы.
        if parsed_cur.path.startswith('/docs/') and not parsed_url.path.startswith('/docs/'):
            need_follow_path = False

        if need_follow_domain and need_follow_path and check(parsed_url):
            return self.urls.visit_url(url)
        else:
            return False

    def do_create(self, page=None, **kwargs):
        """Parse a fetched page and schedule storing of its documents.

        The page is parsed with the parser matching its origin (reverbrain,
        pages marked ``doc_type=external``, or generic HTML/XML). Links found
        on the page are passed to ``follow_urls``. The raw content goes to the
        ``content`` stage. The document is then split by sub-titles: every part
        is sent to ``store`` as its own document, or the whole document is
        stored when there are no parts.

        :param page: HTTP response of the page (status_code, url, headers, content).
        :param kwargs: extra data passed through to ``prepare_doc``.
        """
        if page.status_code != 200:
            log.error('Page %s returned %s', page.url, page.status_code)
            return

        parsed = parse_html(page.content)

        try:
            parsed = utils.fix_encoding(page.content, parsed)
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Best effort: keep the tree as it was parsed, but leave a trace.
            log.debug('Cannot fix encoding for %s', page.url, exc_info=True)

        self.follow_urls(parsed, page.url)

        # A missing Content-Type header is treated as an empty one instead of
        # crashing on `in None`.
        if 'reverbrain' in page.url and 'text' in (page.headers.get('content-type') or ''):
            doc_data = self.parse_reverbrain(page.url, parsed)
        elif utils.find_one(parsed, '//meta[@name="doc_type"]/@content') == 'external':
            doc_data = self.parse_external(page.url, parsed)
        elif utils.get_content_type(page) in ['text/html', 'text/xml']:
            doc_data = self.parse_general(page.url, parsed)
        else:
            log.warning('Skip "store" stage for %s', page.url)
            return
        if not doc_data:
            log.warning('Skip "store" stage for %s, no search content', page.url)
            return

        full_doc = self.prepare_doc(doc_data, **kwargs)
        doc_parts = utils.split_by_sub_titles(doc_data['content'])
        self.next('content', url=full_doc.url, raw_data=page.content, updated_ts=full_doc.updated_ts)

        if not doc_parts:
            self.next('store', document=full_doc)

        # Every part gets its own content, so the full content tree is left out
        # of the per-part copy: deep-copying it for each part is wasted work.
        shared_data = {key: value for key, value in doc_data.items() if key != 'content'}

        for part in doc_parts:
            part_data = deepcopy(shared_data)

            heading = part['title']
            if heading is not None:
                # Address the part by the anchor of its heading.
                part_data['url'] = '#'.join([doc_data['url'], heading.attrib['id']])
                sub_title = utils.get_text_content(heading).strip(' #¶')
                part_data['title'] = ' - '.join([sub_title, doc_data['title']])

            part_data['content'] = part['content']
            part_data['is_part'] = True
            part_data['base_url'] = doc_data['url']

            self.next('store', document=self.prepare_doc(part_data, **kwargs))

    def prepare_doc(self, doc_data, **kwargs):
        """Build the search document for parsed page data.

        :param doc_data: dict produced by one of the ``parse_*`` methods
            (optionally extended with ``is_part`` / ``base_url`` for page parts).
        :param kwargs: may carry ``links`` (dict with ``urls`` / ``texts``) and
            ``search_queries`` (dict merged into the hidden zones).
        :return: the document created by ``self.create_document``.
        """
        content = doc_data['content']
        document = self.create_document(doc_data['url'])

        link_info = kwargs.get('links', {})
        plain_text = utils.get_text_content(content)

        hidden_zones = {
            'product': list(set([doc_data['product']] + list(doc_data['product_name'].values()))),
            'title': [doc_data['title']] + utils.get_elements_content(doc_data['content'], ['h1']),
            'subtitle': utils.get_elements_content(doc_data['content'], ['h2', 'h3']),
            'minortitle': utils.get_elements_content(doc_data['content'], ['h4', 'h5', 'h6']),
            'intlinks': [' '.join(b) for b in get_links(doc_data['content'])],
            'links_urls': link_info.get('urls', ''),
            'links_texts': link_info.get('texts', ''),
            'keywords': get_camelcase_keywords(plain_text),
        }

        # Extend these zones with the words extracted from their CamelCase values.
        for zone in ('title', 'subtitle', 'minortitle', 'intlinks'):
            values = hidden_zones[zone]
            # The list of additions is built completely before the zone is extended.
            values.extend([
                keyword
                for value in values
                for keyword in get_camelcase_keywords(value, add_full_words=False)
            ])

        hidden_zones.update(kwargs.get('search_queries', {}))
        document.emit_body({'content': plain_text, 'z_ns_hidden': hidden_zones})

        document.emit_facet_attr('catalog', self.index, label='Внешний', label_en='External')
        document.emit_facet_attr('type', self.index, label=self.index)

        product = doc_data['product']
        product_name = doc_data['product_name']
        if product and product_name:
            document.emit_facet_attr('product', product,
                                     label=product_name['ru'],
                                     label_en=product_name['en'])

        document.emit_meta_factor('isStartPage', int(doc_data['is_start_page']))
        document.emit_meta_factor('isExternal', 1)
        document.emit_meta_factor('isArchive', 0)
        document.emit_meta_factor('isDoc', 1)

        document.emit_search_attr('s_type', 'doc')
        document.emit_search_attr('is_empty', '0')
        document.emit_search_attr('public', '1')
        document.emit_search_attr('i_is_part', int(doc_data.get('is_part', False)))

        language = doc_data.get('language')
        other_languages = set(doc_data.get('languages')) - {language}
        # A single-language document must be found for every language
        document.emit_search_attr('s_doc_lang', language if other_languages else 'all')

        document.emit_group_attr('base_url', doc_data.get('base_url', doc_data['url'].rsplit('#')[0]))

        document.emit_suggest_attr_by_parts(doc_data['title'])

        for snippet_lang in ('ru', 'en'):
            document.emit_snippet(self.create_snippet(doc_data, snippet_lang), snippet_lang)
        return document

    def create_snippet(self, doc_data, lang):
        """Build the snippet dict of a document for the given language.

        :param doc_data: dict produced by one of the ``parse_*`` methods.
        :param lang: language code of the snippet; selects the product name.
        """
        return dict(
            url=doc_data['url'],
            # An explicit snippet title wins over the document title.
            title=doc_data.get('snippet_title') or doc_data['title'],
            public=True,
            # The description is the text content cut to 400 characters.
            description=utils.truncate_chars(utils.get_text_content(doc_data['content']), 400),
            breadcrumbs=doc_data['breadcrumbs'],
            product=doc_data['product_name'].get(lang),
            modtime=None,
            is_archive=False,
            is_external=True,
            version=doc_data.get('version') or '',
            language=lang,
        )

    def parse_reverbrain(self, url, parsed):
        """Extract document data from a reverbrain documentation page.

        The product slug is the part of the URL path (slashes removed) before
        the first colon. A page is a start page when its path has no colon, or
        has exactly two colon-separated parts that are equal.

        :param url: address of the page.
        :param parsed: parsed HTML tree of the page.
        :return: dict with the document fields consumed by ``prepare_doc``.
        """
        doc_data = {
            'url': url,
            'content': parsed.body,
            'title': '\n'.join(utils.get_elements_content(parsed, ['head/title', 'h1[@id!="site-title"]'])),
            'snippet_title': utils.get_text_content(utils.find_one(parsed, '//title')),
        }
        parts = urlparse(url).path.replace('/', '').split(':')
        product = parts[0]

        doc_data['product'] = product
        # `products` is a defaultdict: indexing it never raises KeyError and would
        # insert an empty entry for every unknown slug, so look it up without inserting.
        doc_data['product_name'] = products.get(product, {})
        doc_data['is_start_page'] = ((len(parts) == 2 and parts[0] == parts[1]) or len(parts) < 2)
        doc_data['language'] = utils.find_one(parsed, '//meta[@name="lang"]/@content')
        doc_data['languages'] = utils.find_one(parsed, '//meta[@name="langs"]/@content').split()

        # Breadcrumb links are resolved against the page address.
        doc_data['breadcrumbs'] = [
            {'url': urljoin(url, anchor.get('href')), 'name': anchor.text}
            for anchor in parsed.xpath('//div[@class="breadcrumbs"]//a')
        ]

        return doc_data

    def parse_external(self, url, parsed):
        """Extract document data from a page marked up for external indexing.

        The searchable content is the element with ``data-search="content"``;
        product, version, start-page flag and breadcrumbs come from meta tags.

        :param url: address of the page.
        :param parsed: parsed HTML tree of the page.
        :return: dict with the document fields consumed by ``prepare_doc``,
            or None when the page has no searchable content.
        """
        content = utils.find_one(parsed, '//*[@data-search="content"]', default=None)
        if content is None:
            log.error('No content for %s found (external parser)', url)
            return

        title = utils.get_text_content(utils.find_one(parsed, '//title'))

        # Meta tag name -> value used when the tag is absent.
        meta_defaults = {
            'updated': '',
            'product': '',
            'product_name_ru': '',
            'product_name_en': '',
            'start_page': 0,
            'version': '',
        }
        meta = {
            name: utils.find_one(parsed, '//meta[@name="%s"]/@content' % name, fallback)
            for name, fallback in meta_defaults.items()
        }

        is_start_page = int(meta['start_page'])
        product = meta['product']
        # Localized names fall back to the product slug.
        product_name = {
            'ru': meta['product_name_ru'] or product,
            'en': meta['product_name_en'] or product,
        }

        raw_crumbs = utils.find_one(parsed, '//meta[@name="breadcrumbs"]/@content', None)
        crumbs = [] if raw_crumbs is None else utils.get_breadcrumbs(raw_crumbs)
        for crumb in crumbs:
            # Breadcrumb links are resolved against the page address.
            crumb['url'] = urljoin(url, crumb['url'])

        language = utils.find_one(parsed, '//meta[@name="lang"]/@content')
        languages = utils.find_one(parsed, '//meta[@name="langs"]/@content').split()

        return {
            'url': url,
            'content': content,
            'title': title,
            'is_start_page': is_start_page,
            'product': product,
            'product_name': product_name,
            'breadcrumbs': crumbs,
            'version': meta['version'],
            'language': language,
            'languages': languages,
        }

    def parse_general(self, url, parsed):
        """Extract document data from an ordinary HTML page.

        The content is the ``role="main"`` element, else ``<main>``, else the
        whole body. The product slug comes from the ``s_product`` meta tag and
        falls back to the first label of the host name.

        :param url: address of the page.
        :param parsed: parsed HTML tree of the page.
        :return: dict with the document fields consumed by ``prepare_doc``.
        """
        content = None
        for xpath in ('//*[@role="main"]', '//main'):
            content = utils.find_one(parsed, xpath, default=None)
            if content is not None:
                break
        if content is None:
            content = parsed.body

        title = utils.get_text_content(utils.find_one(parsed, '//title'))

        slug = utils.find_one(parsed, '//meta[@name="s_product"]/@content')
        component_name = utils.find_one(parsed, '//meta[@name="component_name"]/@content')
        if not slug:
            # str.split always yields at least one item, so [0] cannot fail.
            slug = urlparse(url).netloc.split('.')[0]

        localized_names = {}
        if slug:
            # Priority: name from the page, then the known products table, then the slug.
            localized_names = {
                'ru': component_name or products[slug]['ru'] or slug,
                'en': component_name or products[slug]['en'] or slug,
            }

        language = utils.find_one(parsed, '//meta[@name="lang"]/@content')
        languages = utils.find_one(parsed, '//meta[@name="langs"]/@content').split()

        return {
            'url': url,
            'content': content,
            'title': title,
            'breadcrumbs': [],
            'is_start_page': 0,
            'product': slug,
            'product_name': localized_names,
            'language': language,
            'languages': languages,
        }
