import logging
import os

from lxml.etree import ParserError
from markdown import markdown
from markdown.extensions.fenced_code import FencedCodeExtension
from svn.exception import SvnException

from django.conf import settings
from django.utils.functional import cached_property

from intranet.search.core.sources.utils import (
    get_text_content,
    get_elements_content,
    truncate_chars,
    get_popularity_factors,
)
from intranet.search.core.utils.svn import IsearchSvnClient
from intranet.search.core.utils.xml import parse_html
from intranet.search.core.utils import reraise_as_recoverable
from intranet.search.core.swarm import Indexer

# Module-level logger named after this module's import path.
log = logging.getLogger(__name__)


class Source(Indexer):
    """Indexer of README files from Arcadia.

    Each ``do_*`` method hands its result to the following stage through
    ``self.next``: ``do_walk`` schedules ``fetch``, ``do_fetch`` schedules
    ``create`` and ``do_create`` schedules ``content`` and ``store``.
    """

    SVN_SSH_URL = 'svn+ssh://arcadia-ro.yandex.ru'
    ARCADIA_URL = 'https://a.yandex-team.ru'
    ARCADIA_PREFIX = '/arc/trunk/arcadia'
    README_FILENAME = 'README.md'

    @cached_property
    def svn_client(self):
        """SVN client bound to ``SVN_SSH_URL``; created once per instance."""
        return IsearchSvnClient(self.SVN_SSH_URL, env=self.get_env())

    def get_env(self):
        """Return the environment mapping passed to the SVN client.

        ``SVN_SSH`` is built from ``settings.ISEARCH_SSH_LOGIN`` and
        ``settings.ISEARCH_SSH_KEY_PATH``.
        """
        return {
            'SVN_SSH': 'ssh -l {} -i {} -o "StrictHostKeyChecking=no"'.format(
                settings.ISEARCH_SSH_LOGIN,
                settings.ISEARCH_SSH_KEY_PATH
            )
        }

    def get_library_name(self, path):
        """Return the library name for a README path.

        The path looks like ``library_name/README.md``; everything from the
        last ``/`` onwards is dropped. A path without any ``/`` (a README
        located directly in the root) yields an empty string instead of a
        truncated file name.
        """
        # rpartition returns ('', '', path) when there is no separator, so
        # the head is '' rather than the path minus its last character, which
        # is what slicing by ``rfind() == -1`` would produce.
        return path.rpartition('/')[0]

    def get_readme_paths(self):
        """Yield ``(full_path, path)`` pairs for every README to index.

        ``path`` is the value produced by ``svn_client.filter_recursive`` and
        ``full_path`` is that value joined onto ``ARCADIA_PREFIX``.
        """
        for path in self.svn_client.filter_recursive(
            self.ARCADIA_PREFIX, self.README_FILENAME, files_only=True
        ):
            # Nothing from junk is indexed.
            # NOTE(review): this is a plain prefix match, so any path that
            # merely starts with "junk" (e.g. "junk_foo/...") is skipped
            # too - confirm whether only the "junk/" directory is meant.
            if not path.startswith('junk'):
                yield (os.path.join(self.ARCADIA_PREFIX, path), path)

    def create_url(self, path):
        """Return the Arcadia web URL for ``path`` (``ARCADIA_URL + path``)."""
        return self.ARCADIA_URL + path

    def do_walk(self, **kwargs):
        """Schedule a ``fetch`` step for every README path found."""
        for full_path, path in self.get_readme_paths():
            self.next('fetch', full_path=full_path, path=path)

    def get_arcadia_file_content(self, path):
        """Return the stripped file content decoded as UTF-8.

        Returns ``None`` when the content is not valid UTF-8.
        """
        content = self.svn_client.cat(path).strip()
        try:
            content = content.decode('utf-8')
        except UnicodeDecodeError:
            # Undecodable files are treated the same way as empty ones.
            content = None
        return content

    @reraise_as_recoverable(SvnException)
    def do_fetch(self, full_path, path, **kwargs):
        """Fetch one README and schedule the ``create`` step for it.

        Files with empty or undecodable content are skipped with a warning.
        The popularity factors obtained for the document URL are forwarded
        to the next step as keyword arguments.
        """
        log.info('Start fetching %s', full_path)
        url = self.create_url(full_path)
        content = self.get_arcadia_file_content(full_path)
        if not content:
            log.warning('Skip "store" stage for %s, no search content', url)
            return
        popularity_factors = get_popularity_factors(url)
        self.next('create', raw_content=content, full_path=full_path, path=path,
                  **popularity_factors)

    def do_create(self, raw_content, full_path, path, **kwargs):
        """Build the search document for one README.

        The markdown is rendered to HTML and parsed, then the document body,
        attributes, factors and snippet are emitted. Finally the ``content``
        and ``store`` steps are scheduled. Nothing is emitted or scheduled
        when the rendered HTML is an empty document.

        Optional ``links`` and ``search_queries`` keyword arguments are
        passed on to ``create_body``.
        """
        parsed = markdown(raw_content, extensions=[FencedCodeExtension()])
        try:
            # Sometimes README.md contains nothing but comments.
            parsed = parse_html(parsed)
        except ParserError as e:
            if str(e) == 'Document is empty':
                return
            raise

        url = self.create_url(full_path)
        doc = self.create_document(url)
        content = parsed.body

        doc_data = {
            'url': doc.url,
            'content': content,
            'product': self.get_library_name(path),
        }

        # Adds the 'title', 'subtitle' and 'minortitle' keys.
        doc_data.update(self.get_titles(content, path))

        doc.emit_body(self.create_body(
            doc_data,
            kwargs.get('links', {}),
            kwargs.get('search_queries', {})
        ))

        doc.emit_facet_attr('catalog', 'arcadia', label='Аркадия', label_en='Arcadia')

        doc.emit_meta_factor('isExternal', 1)
        doc.emit_meta_factor('isArchive', 0)
        doc.emit_meta_factor('isDoc', 1)
        doc.emit_search_attr('s_type', 'doc')
        doc.emit_search_attr('is_empty', '0')
        doc.emit_search_attr('public', '1')
        doc.emit_search_attr('i_is_part', 0)
        doc.emit_search_attr('s_doc_lang', 'all')

        doc.emit_group_attr('base_url', url)

        doc.emit_suggest_attr_by_parts(doc_data['title'])

        doc.emit_snippet(self.create_snippet(doc_data))

        self.next('content', url=url, raw_data=raw_content, updated_ts=doc.updated_ts)
        self.next('store', document=doc)

    def get_titles(self, content, file_path):
        """Return the ``title``, ``subtitle`` and ``minortitle`` values.

        ``subtitle`` is collected from h1-h3 elements and ``minortitle`` from
        h4-h6. ``title`` is the first h1-h3 entry (when there is one) joined
        with ``file_path``.
        """
        page_titles = get_elements_content(content, ['h1', 'h2', 'h3'])
        titles = {
            'subtitle': page_titles,
            'minortitle': get_elements_content(content, ['h4', 'h5', 'h6']),
        }
        title_parts = []
        if page_titles:
            title_parts.append(page_titles[0])
        title_parts.append(file_path)
        titles['title'] = ' – '.join(title_parts)
        return titles

    def create_snippet(self, doc_data):
        """Return the snippet dict for a document.

        The description is the text content of the document truncated via
        ``truncate_chars(..., 400)``.
        """
        return {
            'url': doc_data['url'],
            'title': doc_data['title'],
            'public': True,
            'description': truncate_chars(get_text_content(doc_data['content']), 400),
            'breadcrumbs': [],
            'product': doc_data['product'],
            'is_archive': False,
            'is_external': True,
        }

    def create_body(self, doc_data, links, search_queries):
        """Return the body dict for a document.

        ``links`` may provide ``urls`` and ``texts`` (both default to an
        empty string). Every entry of ``search_queries`` is merged into the
        ``z_ns_hidden`` section, overriding keys that are already there.
        """
        body = {
            'content': get_text_content(doc_data['content']),
            'z_ns_hidden': {
                'product': doc_data['product'],
                'title': doc_data['title'],
                'subtitle': doc_data['subtitle'],
                'minortitle': doc_data['minortitle'],
                'links_urls': links.get('urls', ''),
                'links_texts': links.get('texts', ''),
            },
        }
        body['z_ns_hidden'].update(search_queries)
        return body
