"""
Извлечение данных для кросс-ссылочных факторов:
собирает все тексты ссылок на страницу из вики, документации, стартрека и этушки
"""
import os
from logging import getLogger
from urllib.parse import urljoin, urlsplit, urlunsplit, unquote

from lxml.html import document_fromstring as parse
from requests.utils import requote_uri

from yt import yson
from yt.logger import LOGGER
from yt.wrapper import with_context, TablePath

from intranet.search.yt.jobs import BaseJob

log = getLogger(__name__)

# An empty scheme corresponds to scheme-relative URLs such as //yandex.ru
ALLOWED_SCHEMES = ['http', 'https', 'mailto', '']
HIDEREFERER_DOMAIN = 'h.yandex-team.ru'
MAX_KEY_LENGTH = 16383  # maximum key size is 16 KB


def strip_wiki_marks(url):
    """Drop the wiki action markers ``.edit``, ``.create`` and ``.preview`` from the end of *url*.

    The markers are checked one after another in that fixed order, each against
    the result of the previous step, so several stacked markers are removed
    only when they appear in an order this single pass can peel off.
    """
    for mark in ('.edit', '.create', '.preview'):
        if url.endswith(mark):
            url = url[:len(url) - len(mark)]
    return url


def extract_real_url(url):
    """Unwrap a link that goes through the referer-hiding redirector.

    When the host of *url* is ``HIDEREFERER_DOMAIN`` the percent-decoded query
    string is returned (it is taken to be the real target); any other URL is
    returned unchanged.
    """
    parts = urlsplit(url)
    if parts.netloc != HIDEREFERER_DOMAIN:
        return url
    return unquote(parts.query)


def normalize_url(url, base_url=None):
    """Bring a link to the canonical form used as the key of the links table.

    Steps, in order: unwrap the referer-hiding redirector, drop the wiki
    ``!/`` prefix, resolve against *base_url* (treated as a directory),
    lowercase the whole URL, reject schemes outside ``ALLOWED_SCHEMES``, strip
    wiki action markers and one trailing slash from the path, discard the
    query and fragment, and re-quote the result.

    :param url: link target as found in the document; may be relative.
    :param base_url: URL of the page containing the link, if known.
    :return: the normalized URL, or ``None`` for an empty *url* or a
        disallowed scheme.
    """
    if not url:
        return None

    target = extract_real_url(url)

    # A leading "!/" marks a relative page in the wiki
    if target.startswith('!/'):
        target = target[2:]

    if base_url:
        # Force a trailing slash so the link resolves below the page, not beside it
        base = base_url if base_url.endswith('/') else base_url + '/'
        target = urljoin(base, target)

    parts = urlsplit(target.lower())
    if parts.scheme not in ALLOWED_SCHEMES:
        return None

    clean_path = strip_wiki_marks(parts.path)
    if clean_path.endswith('/'):
        clean_path = clean_path[:-1]

    # Query and fragment are intentionally dropped from the key
    rebuilt = urlunsplit((parts.scheme, parts.netloc, clean_path, None, None))
    return requote_uri(rebuilt)


class Job(BaseJob):
    """Build and serve the table of incoming-link (cross-reference) factors.

    A map-reduce over the tables returned by ``get_yt_cache_table`` for the
    configured search indexes extracts every ``<a href>`` from the stored
    HTML and groups anchor texts and source page URLs by the normalized
    target URL.  The result is sorted, turned into a dynamic table and
    published under the ``links`` path, which :meth:`get_links_by_url` reads.
    """

    # (search, index) pairs used when the 'search_indexes' option is not set
    default_indexes = [
        ('wiki', ''),
        ('doc', ''),
        ('doc', 'external'),
        ('at', ''),
        ('st', ''),
    ]
    entity = 'factors'
    key_field = 'key'

    # Unsorted schema of the intermediate table; sort_table() derives the
    # sorted variant from a copy of it.
    schema = [
        {'name': 'key', 'type': 'string'},
        {'name': 'urls', 'type': 'string'},
        {'name': 'texts', 'type': 'string'},
        {'name': 'links_count', 'type': 'int64'},
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.indexes = self.options.get('search_indexes') or self.default_indexes

    def get_page_html(self, row, context):
        """Return the HTML fragments stored in *row* for its source search.

        The search is looked up in ``self.indexes`` by ``context.table_index``
        (the position of the input table the row came from):

        * ``wiki`` - ``raw['html']``;
        * ``st`` - the issue ``descriptionHtml`` plus ``textHtml`` of every comment;
        * ``doc`` - ``raw`` itself;
        * ``at`` - ``raw[1]['content']``.

        :return: list of fragments (possibly empty or containing empty items).
        """
        raw = row.get('raw')
        if not raw:
            return []

        search, _index = self.indexes[context.table_index]
        html = []
        if search == 'wiki':
            html.append(raw.get('html', ''))
        elif search == 'st':
            html.append(raw['issue']['descriptionHtml'])
            html.extend(comment.get('textHtml', '') for comment in raw['comments'])
        elif search == 'doc':
            html.append(raw)
        elif search == 'at':
            html.append(raw[1]['content'])
        return html

    @with_context
    def parse_links(self, row, context):
        """Mapper: yield one record per link found in the document *row*.

        Every ``<a href>`` of every HTML fragment of the row is normalized
        relative to the page URL.  Links that cannot be normalized or whose
        key is longer than ``MAX_KEY_LENGTH`` are skipped with a warning.
        Errors are logged and swallowed so that one bad link or document does
        not fail the whole operation.
        """
        try:
            for html in self.get_page_html(row, context):
                if not html:
                    continue
                html = parse(html)
                page_url = row['url'].lower()

                for link in html.findall('.//a[@href]'):
                    try:
                        href = normalize_url(link.attrib['href'], base_url=page_url)
                        if not href or len(href) > MAX_KEY_LENGTH:
                            LOGGER.warning('Skip url %s on page %s', href, page_url)
                            continue

                        yield {
                            self.key_field: href,
                            'texts': link.text or '',
                            'urls': page_url,
                            'links_count': 1
                        }
                    except Exception as e:
                        # row.get(): the handler itself must not raise on a row without 'url'
                        LOGGER.exception('Error during processing link: %s, row: %s, error: %s',
                                         link.attrib['href'], row.get('url'), e)
        except Exception as e:
            # row.get(): a missing 'url' may be the very reason we got here
            LOGGER.exception('Error during processing row: %s, error: %s', row.get('url'), e)

    def join_links(self, key, rows):
        """Reducer: merge all link records that point to the same document.

        Distinct stripped texts and source URLs are joined with newlines and
        the per-record counters are summed.  Values are sorted so the output
        does not depend on set iteration order and is identical between runs.
        """
        data = {'texts': set(), 'urls': set()}
        links_count = 0
        for row in rows:
            for name in ('texts', 'urls'):
                data[name].add(row[name].strip())
            links_count += row['links_count']

        yield {
            self.key_field: key[self.key_field],
            'texts': '\n'.join(sorted(data['texts'])),
            'urls': '\n'.join(sorted(data['urls'])),
            'links_count': links_count
        }

    @property
    def input_tables(self):
        """Input tables, one per ``(search, index)`` pair, in ``self.indexes`` order.

        The order matters: ``get_page_html`` maps a row back to its search by
        the index of the table it was read from.
        """
        from intranet.search.core.sources.utils import get_yt_cache_table
        return [get_yt_cache_table(search=search, index=index)
                for search, index in self.indexes]

    def run(self):
        """Rebuild the links table: extract, sort, then publish."""
        tmp_table = self._get_table(
            'links_tmp',
            delete=True,
            attributes={'schema': self.schema}
        )
        self.extract_links_data(tmp_table)
        self.sort_table(tmp_table)
        self.move_to_permanent_table(tmp_table)

    def extract_links_data(self, table):
        """Run the map-reduce that fills *table* with per-target link data."""
        self.yt.run_map_reduce(
            mapper=self.parse_links,
            reducer=self.join_links,
            source_table=self.input_tables,
            destination_table=table,
            reduce_by=[self.key_field],
            stderr_table=self._get_table('errors'),
            sync=True,
            # the mapper needs the table index to know which search a row belongs to
            spec={'mapper': {'enable_input_table_index': True}}
        )

    def sort_table(self, table):
        """Sort *table* in place by the key column with a unique-keys schema."""
        # Copy every column description: the class-level ``schema`` is shared,
        # and adding ``sort_order`` to it in place would leak the sorted schema
        # into every later use of ``self.schema`` in this process.
        sorted_schema = yson.YsonList([dict(column) for column in self.schema])
        sorted_schema[0]['sort_order'] = 'ascending'
        sorted_schema.attributes['unique_keys'] = True

        self.yt.run_sort(
            table,
            TablePath(str(table), attributes={'schema': sorted_schema}),
            sort_by=[self.key_field],
            sync=True,
            spec={
                "partition_job_io": {"table_writer": {"block_size": 256 * 2 ** 10}},
                "merge_job_io": {"table_writer": {"block_size": 256 * 2 ** 10}},
                "sort_job_io": {"table_writer": {"block_size": 256 * 2 ** 10}}
            }
        )

    def move_to_permanent_table(self, table):
        """Make *table* dynamic, replace the published ``links`` table with it and mount it."""
        self.yt.alter_table(table, dynamic=True)
        # NOTE(review): YT's built-in table attribute is ``tablet_cell_bundle``
        # (singular) - confirm the plural name used here is intended.
        self.yt.set(f"{table}/@tablet_cell_bundles", self.yt_opts['bundle'])

        out_table = self._get_table_path('links')
        self.yt.remove(out_table, force=True)
        self.yt.move(table, out_table)
        self.yt.mount_table(out_table, sync=True)

    def get_links_by_url(self, url):
        """Look up the collected incoming-link data for *url*.

        The lookup is disabled unless the ``ISEARCH_ENABLE_LINKS_FACTORS``
        environment variable is set.

        :return: dict with ``texts``, ``urls`` and ``links_count``; an empty
            dict when the feature is off, the URL cannot be normalized,
            nothing is found, or the lookup fails (the error is logged).
        """
        if not os.environ.get('ISEARCH_ENABLE_LINKS_FACTORS'):
            return {}

        try:
            url = normalize_url(url)
            if not url:
                # nothing to look up; avoids querying for the literal 'None'
                return {}
            path = self._get_table_path('links')

            # The URL goes into a quoted literal of the query: escape the
            # backslash and the quote so the value cannot terminate it.
            escaped_url = url.replace('\\', '\\\\').replace("'", "\\'")
            data = list(self.yt.select_rows(f"* from [{path}] where key='{escaped_url}'"))
            if not data:
                return {}

            row = data[0]
            return {
                'texts': row['texts'],
                'urls': row['urls'],
                'links_count': row['links_count'],
            }
        except Exception:
            log.exception('Cannot get links by url: %s', url)
            return {}
