import os
import re
import logging
from itertools import zip_longest
from urllib.parse import urlunsplit

from django.conf import settings
from django.utils.functional import cached_property

from intranet.search.abovemeta.utils import string_to_bool

from intranet.search.core.snippets.wiki import WikiSnippet

from intranet.search.core.sources.utils import (
    date_as_factor,
    normalize,
    truncate_chars,
    get_text_content,
    get_elements_content,
    timestamp_to_utc_date
)
from intranet.search.core.models import Indexation
from intranet.search.core.swarm import Indexer
from intranet.search.core.tvm import tvm2_client
from intranet.search.core.utils import http
from intranet.search.core.utils.xml import parse_html
from intranet.search.core.errors import RecoverableError
from . import utils

# Minimum length, in characters, of a page's whitespace-collapsed text content;
# `BaseWikiSource.do_create` flags shorter pages with `is_empty = 1`.
EMPTY_THRESHOLD = 200

# Module-level logger.
log = logging.getLogger(__name__)


class BaseWikiSource(Indexer):
    """Indexer for wiki pages.

    `do_walk` lists pages through the wiki catalog API, `do_fetch` downloads a
    single page through the wiki page API, and the result is turned either
    into a search document (`do_create`) or into a deletion (`do_delete`).
    """

    # `do_fetch` persists `last_wiki_pk` only for pages whose `fetch_id` is a
    # multiple of this value.
    batch_size_for_since_update = 100
    # Passed as `max_retries` when the HTTP session is created.
    http_retries = 0
    # Passed as `_retries` / `timeout` to the catalog request in `do_walk`.
    catalog_retries = 3
    catalog_timeout = 20
    # Passed as `timeout` to the page request in `do_fetch`.
    page_content_timeout = 120
    # Page API statuses after which `do_fetch` deletes the document
    # (the page request is made with `allow_redirects=False`).
    delete_codes = [404, 302, 301]
    # Page API statuses on which `do_fetch` raises `RecoverableError`.
    retry_codes = [429]
    # Page API statuses after which `do_fetch` schedules document creation.
    ok_codes = [200]
    # Passed as `_ok_statuses` to the page request in `do_fetch`.
    # NOTE(review): `retry_codes` are not part of this list — confirm that
    # `http.call_with_retry` still returns such responses to the caller.
    ok_statuses = delete_codes + ok_codes
    # Passed to the HTTP session as the `lang` request parameter.
    language = 'ru'
    # Lower-cased url path segments that make `emit_factors` emit isVodstvo=1.
    vodstvo_parts = {'faq', 'rukovodstvo', 'guide', 'manual', 'diy', 'vodstvo'}
    # Wiki API settings; the methods read the `wiki_catalog`, `wiki_page` and
    # `site_endpoint` entries.
    api = settings.ISEARCH['api']['wiki']

    def _get_headers(self):
        """Return the catalog API headers extended with a TVM2 service ticket.

        The ticket is requested for the `wiki` service and stored under the
        header name configured in `settings.TVM2_SERVICE_HEADER`.
        """
        request_headers = self.api['wiki_catalog'].headers()
        service_ticket = tvm2_client.get_service_ticket('wiki')
        request_headers[settings.TVM2_SERVICE_HEADER] = service_ticket
        return request_headers

    @cached_property
    def session(self):
        """HTTP session shared by all wiki API requests of this indexer.

        Created on first access and cached on the instance afterwards.
        """
        session_options = {
            'max_retries': self.http_retries,
            'verify': settings.ISEARCH_CA_BUNDLE,
            'headers': self._get_headers(),
            'params': {'lang': self.language},
        }
        return http.create_session(**session_options)

    def do_setup(self, **kwargs):
        """Store page-view metrics in the wiki profile storage, then run the base setup.

        Loading the metrics is best-effort: any error is logged and the setup
        continues. Only `Exception` subclasses are swallowed, so interpreter
        level signals such as `KeyboardInterrupt` and `SystemExit` still
        propagate.

        :param kwargs: accepted for interface compatibility; not forwarded to
            the base implementation.
        """
        try:
            metrix = utils.prepare_metrix()
            for key, value in metrix.items():
                self.profile_storage.update('wiki', key, {'page_views': value})
        except Exception:
            log.exception('Wiki profiles error: METRIX')

        super().do_setup()

    def do_walk(self, url=None, **kwargs):
        """Schedule a `fetch` step for every page that has to be indexed.

        With explicit `keys` in the options each key is fetched directly and
        the catalog is not queried. Otherwise one page of the wiki catalog is
        requested (from `url` when given, else from a url built out of the
        options) and, if the catalog reports a next page, another `walk` step
        is scheduled for it.

        :param url: catalog url to request; built from the options when empty.
        :param kwargs: may carry `fetch_id`, the number assigned to the first
            page of this catalog response (defaults to 0).
        """
        options = self.options
        if options['keys']:
            # do not support `fetch_id` for an indexation with given keys since this indexation is probably small
            for key in options['keys']:
                self.next('fetch', uri=key)
            return

        query = {}
        for param_name, option_name in (('since', 'ts'), ('limit', 'limit')):
            option_value = options[option_name]
            if option_value:
                query[param_name] = option_value
        last_wiki_pk = options.get('last_wiki_pk')
        if last_wiki_pk:
            # see below
            # https://wiki.yandex-team.ru/wiki/components/page-catalog/?from=%2Fwiki%2Fapi%2Fpage-catalog%2F
            query['gt_pk'] = last_wiki_pk

        if not url:
            url = self.api['wiki_catalog'].url(query=query)
        # NOTE(review): `verify=False` switches TLS certificate verification off
        # for this request although the session is created with a CA bundle —
        # confirm this is intentional.
        response = http.call_with_retry(
            self.session.get,
            url,
            verify=False,
            timeout=self.catalog_timeout,
            _retries=self.catalog_retries,
        )
        catalog = response.json()['data']

        first_fetch_id = kwargs.pop('fetch_id', 0)
        pages = catalog['pages']
        for page_fetch_id, page in enumerate(pages, first_fetch_id):
            self.next('fetch', uri=page['uri'], fetch_id=page_fetch_id, page_pk=page['pk'])

        next_url = catalog['next']
        if next_url is not None:
            self.next('walk', url=next_url, fetch_id=first_fetch_id + len(pages))

    def do_fetch(self, uri, **kwargs):
        """Fetch a single wiki page and route it to creation or deletion.

        Depending on the status of the page API response:

        * a status from `ok_codes` schedules a `create` step with the page data;
        * a status from `delete_codes` deletes the document;
        * a status from `retry_codes` raises `RecoverableError`;
        * any other status that reaches this method is ignored.

        :param uri: page uri as returned by the catalog; uris of deleted pages
            carry a `.deleted.<timestamp>.<n>/` prefix, which is stripped here.
        :param kwargs: may carry `fetch_id` (number of the page within the
            walk, defaults to 0) and `page_pk` (catalog primary key of the page).
        :raises RecoverableError: when the response status is in `retry_codes`.
        """
        log.info('Start fetching page: %s', uri)
        if uri.startswith('.deleted'):
            # The catalog API returns uris of deleted pages in this form:
            # .deleted.1376158168.0/wiki/vodstvo/zagolovki
            # 1376158168 in the prefix is the time the document was deleted,
            # everything after the first slash is the real uri of the page.
            prefix, uri = uri.split('/', 1)
            updated_ts = int(prefix.split('.')[2])
        else:
            updated_ts = None
        api_url = self.api['wiki_page'].url().format(uri=uri)

        page = http.call_with_retry(self.session.get, api_url,
                                    verify=settings.ISEARCH_CA_CERTS,
                                    _ok_statuses=self.ok_statuses,
                                    allow_redirects=False,
                                    timeout=self.page_content_timeout)

        if page.status_code in self.ok_codes:
            data = page.json()['data']
            fetch_id = kwargs.pop('fetch_id', 0)
            # Fetches scheduled from explicit keys carry no `page_pk`; there is
            # nothing to persist for them, so the update is skipped instead of
            # failing with a KeyError and logging a spurious traceback.
            page_pk = kwargs.pop('page_pk', None)
            # update even for the first fetch just to get the statistics
            if page_pk is not None and fetch_id % self.batch_size_for_since_update == 0:
                # Best-effort bookkeeping: a failure here must not abort the fetch.
                try:
                    last_wiki_pk = int(page_pk)
                    log.info(
                        'updating last_wiki_pk for indexation %s to %s',
                        self.indexation_id, last_wiki_pk
                    )
                    Indexation.objects.filter(id=self.indexation_id).update(
                        last_wiki_pk=last_wiki_pk
                    )
                    log.info('last_wiki_pk for indexation %s has been updated', self.indexation_id)
                except Exception:
                    log.exception('cannot update last_wiki_pk for indexation %s', self.indexation_id)
            url = self.get_doc_url(data['wiki_page_url'], data['frontend_host'])
            data['text_content'] = get_text_content(parse_html(data['html']))
            data.update(self.extra_fetch(url, data))
            self.next('create', url=url, data=data)
        elif page.status_code in self.delete_codes:
            self.do_delete(self.get_doc_url(uri), updated_ts=updated_ts)
        elif page.status_code in self.retry_codes:
            # NOTE(review): `retry_codes` are not listed in `_ok_statuses` —
            # confirm `http.call_with_retry` lets such a response through.
            raise RecoverableError()

    def get_doc_url(self, uri, host=None):
        host = host or self.api['site_endpoint']['host']
        url = urlunsplit(('https', host, uri, '', '')).lower()
        return url

    def extra_fetch(self, url, data):
        return {'translated': {}, 'links': {}, 'search_queries': {}, 'is_moved': False}

    def do_delete(self, url, updated_ts=None, **kwargs):
        doc = self.create_document(url)
        if updated_ts:
            doc.updated_ts = updated_ts
        if self.need_content:
            self.next('content', url=url, delete=True)
        self.next('store', document=doc, delete=True)

    def emit_factor(self, doc, name, value):
        doc.emit_meta_factor(name, value)

    def emit_factors(self, doc, data):
        """Emit the factors of a wiki page onto `doc`.

        Every factor is emitted through `self.emit_factor`, in a fixed order.

        :param doc: document being built; its `updated_ts` and `url` are read.
        :param data: page data from the wiki page API, extended by `do_create`
            with `is_empty`.
        """
        def emit(name, value):
            self.emit_factor(doc, name, value)

        emit('isEmpty', data['is_empty'])

        emit('modifiedAt', date_as_factor(doc.updated_ts))
        emit('createdAt', date_as_factor(data['ctime']))

        # 1 when any segment of the page path is one of `vodstvo_parts`
        path_segments = {segment.lower() for segment in data['wiki_page_url'].split('/')}
        emit('isVodstvo', int(bool(path_segments & self.vodstvo_parts)))

        # flag: whether the page is a user cluster
        emit('userCluster', data['usercluster'])

        emit('isUser', int('/users/' in doc.url))
        emit('nesting', (len(data['breadcrumbs']) - 1) // 2)

        # (factor name, key in `data`, norm passed to `normalize`)
        normalized_counters = (
            # number of links to the page from other wiki pages
            ('linkedFromCount', 'linked_from_count', 100),
            # number of subpages
            ('descendantsCount', 'descendants_count', 100),
            # how many times the page was added to favourites
            ('favoritedCount', 'favorited_count', 50),
            # how many users other than the author changed the page
            ('modifiersCount', 'modifiers_count', 50),
            # number of files attached to the page
            ('filesCount', 'files_count', 50),
            # number of comments on the page
            ('commentsCount', 'comments_count', 50),
        )
        for factor_name, data_key, norm in normalized_counters:
            emit(factor_name, normalize(data[data_key], norm))

        emit('isTrashTitle', int(utils.is_trash_title(data['title'])))

        # outdated pages get the archive factor
        outdated = data.get('not_actual_since') or data.get('is_moved')
        emit('isArchive', 1 if outdated else 0)

        if data['is_documentation']:
            emit('isWikiDoc', 1)

    def emit_facet_attrs(self, doc, data):
        """Emit the cluster facets (`cluster1`, `cluster2`, ...) and the page type facet.

        Cluster facets are built from the parents of the page followed by the
        page itself and are cut to `settings.ISEARCH_WIKI_CLUSTER_FACET_DEPTH`
        entries. Documentation pages additionally get a `page_type` facet.
        """
        depth = settings.ISEARCH_WIKI_CLUSTER_FACET_DEPTH

        parent_pairs = zip_longest(data['parents'], data['translated'].get('parents', []))
        # The wiki may have gaps in the page tree: e.g. the page
        # /users/tmalikova/a/b may have no parent page /users/tmalikova/a.
        # In that case the parent /users/tmalikova/a has no `title` field;
        # such parents are skipped, and so is the `users` root.
        facets = [
            {
                'value': parent['supertag'],
                'label': parent['title'],
                'label_en': translated_title or parent['title'],
            }
            for parent, translated_title in parent_pairs
            if parent['supertag'] != 'users' and parent.get('title')
        ]
        # the page itself closes the chain
        facets.append({
            'value': data['wiki_page_supertag'],
            'label': data['title'],
            'label_en': data['translated'].get('title') or data['title'],
        })

        for level, facet in enumerate(facets[:depth], 1):
            doc.emit_facet_attr(name='cluster%d' % level, **facet)

        if data['is_documentation']:
            doc.emit_facet_attr(
                'page_type',
                'documentation',
                label='Документация',
                label_en='Documentation',
            )

    def emit_acl_groups_whitelist(self, doc, data):
        for group in data.get('acl_groups_whitelist', []):
            doc.emit_search_attr('acl_groups_whitelist', group)

    def _parse_cache_row(self, row):
        return {'url': row['url'], 'data': row['raw']}

    def do_create(self, url, data=None, **kwargs):
        """Build the search document of a wiki page and schedule its storing.

        Pages carrying the `noindex` keyword are skipped entirely. For every
        other page the search/group attributes, factors, facets, body and
        snippets are emitted, then the `content` step (when `need_content` is
        set) and the `store` step are scheduled.

        :param url: url of the document.
        :param data: page data prepared by `do_fetch`; `is_empty` is added to it.
        :param kwargs: accepted and ignored.
        """
        # Don't index pages with "noindex" keyword
        if 'noindex' in data.get('keywords', []):
            return

        html_tree = parse_html(data['html'])
        modified = timestamp_to_utc_date(int(data['mtime']))

        doc = self.create_document(url, updated=modified)
        # the page is "empty" when its whitespace-collapsed text is too short
        collapsed_text = ' '.join(data['text_content'].split())
        data['is_empty'] = int(len(collapsed_text) < EMPTY_THRESHOLD)

        # NOTE(review): '%s' is not among the format codes documented for
        # strftime and depends on the platform C library — confirm.
        doc.emit_search_attr('mtime', modified.strftime('%s'))
        doc.emit_search_attr('s_cluster_one', data['cluster_one'])
        doc.emit_search_attr('is_empty', str(data['is_empty']))
        doc.emit_search_attr('s_type', 'wiki')
        doc.emit_group_attr('thread_id_grp', url)

        doc.emit_search_attr('public', str(data['public']))
        self.emit_acl_groups_whitelist(doc, data)
        for user_login in data.get('acl_users_whitelist', []):
            doc.emit_search_attr('acl_users_whitelist', user_login)

        self.emit_factors(doc, data)
        self.emit_facet_attrs(doc, data)

        # every segment of the page tag, both raw and split on capital letters
        raw_tags = data['wiki_page_tag'].split('/')
        camel_chunk = re.compile(r'([A-ZА-Я]+[^A-ZА-Я]*)')
        tags = [' '.join(camel_chunk.split(raw_tag)).strip() for raw_tag in raw_tags]

        ticket_xpath = 'span[contains(concat(" ", @class, " "), " wiki-ticket ")]'
        tickets = get_elements_content(html_tree, [ticket_xpath])
        links = data.get('links', {})

        author_fields = ('author_login', 'author_full_name', 'author_full_name_en')
        authors = [
            ', '.join([author[field] for field in author_fields])
            for author in data['authors']
        ]
        titles = [data['title'], data['translated'].get('title', '')]

        breadcrumbs = ' '.join(tags + raw_tags)

        if string_to_bool(os.environ.get('ISEARCH_ENABLE_WIKI_SUGGEST')):
            # ISEARCH-7005 specification support
            for suggestion in (titles, url, breadcrumbs):
                doc.emit_suggest_attr(suggestion)

        hidden_zones = {
            'translated': data['translated'].get('text_content', ''),
            'keywords': ', '.join(data.get('keywords', [])),
            'title': titles + get_elements_content(html_tree, ['h1']),
            'subtitle': get_elements_content(html_tree, ['h2', 'h3']),
            'minortitle': get_elements_content(html_tree, ['h4', 'h5', 'h6']),
            'cluster_one': {
                'id': data['cluster_one'],
                'name': data['cluster_one_name'],
                'supertag': data['cluster_one_supertag'],
            },
            'owner': authors,
            'tag_one': ' '.join((tags[-1], raw_tags[-1])),
            'breadcrumbs': breadcrumbs,
            'tickets': tickets,
            'links_urls': links.get('urls', ''),
            'links_texts': links.get('texts', ''),
        }
        hidden_zones.update(data.get('search_queries', {}))
        doc.emit_body({
            'content': data['text_content'],
            'z_ns_hidden': hidden_zones,
        })

        self.emit_snippet(doc, data, 'ru')
        if data['translated']:
            self.emit_snippet(doc, data, 'translated_en', is_auto_translated=True)

        if self.need_content:
            self.next('content', url=url, raw_data=data, updated_ts=doc.updated_ts)
        self.next('store', document=doc)

    def emit_snippet(self, doc, data, lang='ru', is_auto_translated=False):
        """Emit a `WikiSnippet` for the page under the given language.

        :param doc: document the snippet is emitted on.
        :param data: page data; title and text are read from
            `data['translated']` when `is_auto_translated` is set, otherwise
            from `data` itself.
        :param lang: language key the snippet is emitted under.
        :param is_auto_translated: build the snippet from the translated
            content and point its url at the `/.ru-en` variant of the page.
        """
        if is_auto_translated:
            snippet_url = doc.url + '/.ru-en'
            source = data['translated']
        else:
            snippet_url = doc.url
            source = data
        snippet_title = source['title']
        snippet_text = source['text_content']

        fields = {
            'url': snippet_url,
            'title': snippet_title,

            'breadcrumbs': data['breadcrumbs'],
            'description': truncate_chars(snippet_text, 200),

            'modtime_timestamp': doc.updated.strftime('%s'),
            'modtime': doc.updated,

            'public': data['public'],
            'is_empty': data['is_empty'],
            'is_documentation': bool(data['is_documentation']),

            # both flags carry the same value
            'is_translated': is_auto_translated,
            'is_auto_translated': is_auto_translated,
        }
        doc.emit_snippet(WikiSnippet(fields), lang)
