import os
import re
import tempfile
from collections import defaultdict
from hashlib import md5
from itertools import chain
from logging import getLogger

import requests
from django.conf import settings
from openpyxl import load_workbook

from intranet.search.core.snippets.hr import ClinicSnippet
from intranet.search.core.sources.people.utils import gen_phones, normalize_phone
from intranet.search.core.swarm import Indexer
from intranet.search.core.utils.http import call_with_retry


log = getLogger(__name__)


# Tokens shorter than this are not emitted as suggest attributes (see Source.do_create).
MIN_SUGGEST_LENGTH = 2


def clean(value):
    """Strip surrounding whitespace and separator punctuation from a value.

    Falsy values (``None``, ``''``, ``0``) are returned unchanged. Truthy
    non-string values (for example numbers read from a spreadsheet cell) are
    converted to ``str`` first instead of failing on ``.strip()``.

    :param value: raw cell value or text fragment
    :return: the value without leading/trailing whitespace and ``-``, ``,``,
        ``;`` characters
    """
    if not value:
        return value
    if not isinstance(value, str):
        value = str(value)

    # A single pass leaves whitespace that was hidden behind the punctuation
    # ('foo ;' -> 'foo '), so repeat until nothing more can be stripped.
    previous = None
    while previous != value:
        previous = value
        value = value.strip().strip('-,;')
    return value


class Source(Indexer):
    """ Indexer of the clinics included in the voluntary medical insurance (DMS) programme
    """
    # Default wiki page path used by do_setup when no keys are passed in the options.
    CATALOG_URL = ('HR/Kompensacii/Straxovanie-2017-2018/'
                   'Straxovanie-sotrudnikov/DMS-/'
                   'Programma-straxovanija-sotrudniki-')

    # Wiki API endpoint descriptions taken from the project settings.
    base_api = settings.ISEARCH['api']['wiki']

    def do_setup(self, **kwargs):
        """Schedule a ``walk`` step for every clinic spreadsheet attached to the catalog pages.

        The pages come from ``self.options['keys']``; when that is empty the
        default ``CATALOG_URL`` is used. For each page the list of attached
        files is requested from the wiki API, and every file accepted by
        ``is_clinic_file`` is passed on to the ``walk`` step.
        """
        pages = self.options['keys'] or [self.CATALOG_URL]
        files_api = self.base_api['wiki_page_files']

        for page_uri in pages:
            listing = call_with_retry(
                requests.get,
                files_api.url().format(uri=page_uri),
                headers=files_api.headers(),
                verify=settings.ISEARCH_CA_BUNDLE,
            )
            attachments = listing.json()['data'].get('data', [])
            for attachment in attachments:
                if self.is_clinic_file(attachment):
                    self.next('walk', data=attachment)

    def is_clinic_file(self, data):
        clinic_part = 'клиники'
        return data['name'].endswith('.xlsx') and clinic_part in data['name'].lower()

    def do_walk(self, data, **kwargs):
        """Parse one clinics spreadsheet and schedule a ``create`` step per clinic address.

        The region is the part of the file name before the first ``_``.
        Rows without an address are treated as service category headers when
        they have a name, and are logged and skipped otherwise. Every
        non-blank line of an address cell becomes its own clinic record.
        Records that resolve to the same document id are merged into one and
        accumulate the categories they were listed under.

        :param data: wiki file description; ``name`` and ``url`` are read here
        """
        region = data['name'].split('_')[0]
        category = None
        current_clinic = {}
        all_clinics = {}

        for clinic in self.get_clinics(data):
            raw_address = clinic.get('address')
            if not raw_address or not raw_address.strip():
                name = clinic.get('name')
                if name:
                    # service categories are placed in separate rows
                    category = name.strip()
                else:
                    # neither an address nor a name: some stray empty row
                    log.warning('Got suspicious clinic from %s, data: %s', data['url'], clinic)
                continue

            for address in raw_address.split('\n'):
                address = address.strip()
                if not address:
                    # A blank line inside the cell is not a real address; processing it
                    # would create a bogus clinic and trigger a useless geocoder call.
                    continue

                # The same clinic may appear under several categories of one file;
                # such rows have to be merged into a single document.
                clinic_data = {'categories': [category] if category else [],
                               'region': region}
                for key, value in clinic.items():
                    clinic_data[key] = clean(value) or current_clinic.get(key)
                clinic_data['address'] = address
                clinic_data['normalized_address'] = self.normalize_address(address, region)

                doc_id = self.get_doc_id(clinic_data)
                known = all_clinics.get(doc_id)
                if known is None:
                    all_clinics[doc_id] = clinic_data
                elif category and category not in known['categories']:
                    # Never record a missing category and never record one twice.
                    known['categories'].append(category)

                # In some files the branches of one clinic are grouped together: the
                # clinic name sits in one merged cell and the addresses in separate rows.
                # Remember the current clinic so that the name and the other fields can
                # be filled in for the branches that follow it.
                current_clinic = clinic_data

        for clinic in all_clinics.values():
            self.next('create', data=clinic)

    def get_clinics(self, data):
        """Download a clinics spreadsheet and yield its data rows as dicts.

        The first row of the active sheet is treated as the header and is
        mapped to field names by ``_detect_fields``. Every following row is
        yielded as ``{field_name: cell_value}``, pairing names and values by
        position. The workbook is closed and the downloaded file is removed
        once the generator is exhausted or closed.

        :param data: wiki file description; only ``url`` is read
        """
        file_name = self._download_file(data['url'])
        try:
            wb = load_workbook(file_name, read_only=True)
            try:
                # ``active`` is the supported accessor; ``get_active_sheet()`` is deprecated.
                sheet = wb.active
                field_names = []
                for i, row in enumerate(sheet.rows):
                    if i == 0:
                        field_names = self._detect_fields(row)
                        continue
                    values = (cell.value for cell in row)
                    yield dict(zip(field_names, values))
            finally:
                # A read-only workbook keeps the file open until it is closed explicitly.
                wb.close()
        finally:
            os.unlink(file_name)

    def _detect_fields(self, row):
        field_map = {'name': 'название', 'address': 'адрес', 'metro': 'метро',
                     'phone': 'телефон', 'status': 'статус', 'working_hours': 'работ',
                     'site_url': 'сайт'}
        field_names = []
        for cell in row:
            if not cell.value:
                continue
            for key, value in field_map.items():
                if value in cell.value.lower():
                    field_names.append(key)
                    field_map.pop(key)
                    break
        return field_names

    def _download_file(self, url):
        """Download a clinics file from the wiki server into a temporary file.

        The file is created with ``tempfile.mkstemp`` so that concurrent
        downloads of equally named attachments cannot overwrite each other and
        the path is not predictable. The original extension is kept because
        the workbook loader decides by it whether the file is supported. The
        caller is responsible for removing the returned file.

        :param url: path to the file on the wiki
        :return: path to the downloaded file on the local disk
        :raises requests.HTTPError: if the server answers with an error status
        """
        endpoint = self.base_api['wiki_file']
        url = endpoint.url(path=url)
        suffix = os.path.splitext(url.split('/')[-1])[1]
        response = requests.get(url, stream=True, headers=endpoint.headers(),
                                verify=settings.ISEARCH_CA_BUNDLE)
        try:
            response.raise_for_status()
            fd, local_filename = tempfile.mkstemp(prefix='clinics-', suffix=suffix)
            try:
                with os.fdopen(fd, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            except BaseException:
                # Do not leave a partially written file behind.
                os.unlink(local_filename)
                raise
        finally:
            # A streamed response holds its connection until it is closed.
            response.close()
        return local_filename

    def do_create(self, data, **kwargs):
        """Build the search document for one clinic and pass it to the ``store`` step.

        Emits the searchable body, the ``ClinicSnippet`` shown in results and
        suggest attributes built from the words of the name, address and
        metro station. Missing optional fields (phone, site url, metro, status,
        working hours, normalized address) are tolerated.

        :param data: clinic record; ``name``, ``address``, ``region`` and
            ``categories`` are required
        """
        doc_id = self.get_doc_id(data)
        doc = self.create_document(doc_id)

        # A clinic row may have an empty name cell; do not crash on ``None``.
        name = clean(data['name']) or ''
        if name.lower().startswith('new '):
            # drop the 'new ' marker that prefixes some clinic names
            name = name[4:]

        metro = clean(data.get('metro', ''))
        address = clean(data.get('normalized_address') or data['address'])
        # The phone cell may be empty or the column may be absent altogether.
        raw_phones = data.get('phone') or ''
        phones = [p.strip() for p in re.split('[,;]', raw_phones)]
        phones = [p for p in phones if p]

        body = {
            'name': name,
            'address': [data['region'], address],
            'phone': list(chain.from_iterable(gen_phones(phone) for phone in phones)),
            'metro': metro,
            'url': data.get('site_url'),
            'category': data['categories'],
        }
        doc.emit_body(body)

        snippet = ClinicSnippet({
            'id': doc_id,
            'categories': data['categories'],
            'name': name,
            'region': data['region'],
            'address': address,
            'phones': [normalize_phone(phone) for phone in phones],
            'metro': metro,
            'url': clean(data.get('site_url', '')),
            'status': clean(data.get('status', '')),
            'working_hours': clean(data.get('working_hours', '')),
        })
        doc.emit_snippet(snippet)

        for field in (name, address, metro):
            if not field:
                continue
            # every sufficiently long word becomes a suggest attribute
            for part in re.split(r'[.,;()\s-]+', field):
                part = clean(part).strip('"\'\\')
                if len(part) >= MIN_SUGGEST_LENGTH:
                    doc.emit_suggest_attr(part)
        self.next('store', document=doc)

    def get_doc_id(self, data):
        clinic_id = '{}---{}'.format(data['name'], data.get('normalized_address') or data['address'])
        return md5(clinic_id.encode('utf-8', errors='ignore')).hexdigest()

    def normalize_address(self, address, region):
        """Normalize a clinic address with the geosearch service.

        The postal index at the start of the address and a trailing metro
        station are removed, the region is prepended unless the address
        already mentions it, and the result is geocoded. The normalized form
        is ``locality[, street], house``; the street may be absent, while the
        locality and the house are required.

        :param address: raw address text
        :param region: region name the clinic belongs to
        :return: normalized address, or ``None`` when the address is empty or
            cannot be normalized (the failure is logged)
        """
        endpoint = settings.ISEARCH['api']['geosearch']

        # remove the postal index
        address = re.sub(r'^\s*(\d{6})', '', address)
        # remove the metro station
        address = re.sub(r'ст[.\s]+м[\s.]+.+$', '', address)
        address = clean(address)

        if not address:
            # Nothing to geocode; skip the request, which could only fail.
            return None

        if region.lower() in address.lower():
            text = address
        else:
            text = f'{region}, {address}'

        url = endpoint.url(query={'geocode': text, 'results': 1})
        try:
            # The session is closed as soon as the response has been read.
            with requests.session() as session:
                payload = call_with_retry(session.get, url).json()

            geo_object = payload['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']
            metadata = geo_object['metaDataProperty']
            components = metadata['GeocoderMetaData']['Address']['Components']
            names_by_kind = {component['kind']: component['name'] for component in components}

            optional_kinds = {'street'}
            parts = []
            for kind in ('locality', 'street', 'house'):
                if kind in names_by_kind:
                    parts.append(names_by_kind[kind])
                elif kind not in optional_kinds:
                    # Without a locality or a house the result is too vague to use.
                    raise KeyError(kind)
            return ', '.join(parts)
        except Exception:
            # Best effort: normalization failures must not break indexing.
            log.exception('Cannot normalize clinic address: %s, %s', region, address)
            return None
