# -*- coding: utf-8 -*-

import argparse
import codecs
import json
import logging
import re
import requests
import time
import xml.etree.ElementTree


def get_config(input_config_file_path):
    """Read the UTF-8 encoded JSON file at ``input_config_file_path`` and return the parsed value."""
    with codecs.open(input_config_file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)


def __get_dump_config_formating_params():
    """Return the keyword arguments shared by every json.dump / json.dumps call of the config."""
    return dict(indent=2, ensure_ascii=False)


def dumps_config(config):
    """Serialize ``config`` to JSON with the shared formatting and return it encoded as UTF-8."""
    formatting = __get_dump_config_formating_params()
    serialized = json.dumps(config, **formatting)
    return serialized.encode('utf-8')


def dump_config(config, output_config_file_path):
    """Write ``config`` as JSON (shared formatting) to ``output_config_file_path`` in UTF-8."""
    formatting = __get_dump_config_formating_params()
    with codecs.open(output_config_file_path, 'w', encoding='utf-8') as sink:
        json.dump(config, sink, **formatting)


# Maps a device key to the URL path that is prefixed to the cgi string
# when the per-device 'query_links' of a query are built.
__DEVICE_URL_PATHS = {
    'desktop': '/video/search',
    'touch': '/video/touch/search',
    'tablet': '/video/pad/search',
    'beta': '/video/beta/search'
}


def __get_quoted_query(s):
    """Percent-encode ``s`` after encoding it to UTF-8; no character is treated as safe."""
    utf8_encoded = s.encode('utf-8')
    return requests.utils.quote(utf8_encoded, safe='')


# The function accepts only dict(unicode -> unicode or list of unicode).
# requests.compat.urlencode is not used because of:
#   1) json.load represents all strings as unicode - this behaviour is not accepted by requests.compat.urlencode
#   2) requests.compat.urlencode substitutes ' ' as '+'
def __get_cgis_string(cgis):
    """Build a '&'-joined 'key=value' query string from ``cgis``.

    A list value is unrolled into one 'key=value' pair per item, in list
    order.  Keys and values are percent-encoded with __get_quoted_query.

    Raises ValueError when any key or (unrolled) value is not a unicode string.
    """
    # ``unicode`` on Python 2 and ``str`` on Python 3, without naming either.
    text_type = type(u'')

    unrolled_cgis = []
    for key, value in cgis.items():
        values = value if isinstance(value, list) else [value]
        unrolled_cgis.extend((key, item) for item in values)

    # Validate everything before quoting so a bad entry fails with a clear error.
    for key, value in unrolled_cgis:
        if not (isinstance(key, text_type) and isinstance(value, text_type)):
            raise ValueError('Incorrect invocation of __get_cgis_string method: expected unicode')

    return '&'.join(
        __get_quoted_query(key) + '=' + __get_quoted_query(value)
        for key, value in unrolled_cgis
    )


def __remove_filtering_cgis(cgis):
    """Return a shallow copy of ``cgis`` without the filtering cgis; ``cgis`` itself is untouched."""
    stripped = cgis.copy()
    for name in (u'kids',):
        # pop with a default is a no-op when the key is absent
        stripped.pop(name, None)
    return stripped


def __adjust_thumb_cgis(cgis):
    """Return a shallow copy of ``cgis`` with the fixed thumb-request cgis set.

    'relev', 'waitall', 'timeout', 'no-tests' and 'srcskip' are overwritten
    if already present; ``cgis`` itself is not modified.
    """
    adjusted = cgis.copy()
    adjusted[u'relev'] = u'pf=strict'
    adjusted[u'waitall'] = u'da'
    adjusted[u'timeout'] = u'10000000'
    adjusted[u'no-tests'] = u'1'
    # Every source combined with every tag suffix, source-major order.
    adjusted[u'srcskip'] = [
        u'VIDEO{}{}'.format(source, tag)
        for source in ('QUICK', 'ULTRA')
        for tag in ('', '_EXP', '_MISSPELL', '_EXP_MISSPELL')
    ]
    return adjusted


def __get_xml_search_url(top_level_domain, cgis):
    """Return the video-xml url for ``top_level_domain`` with the thumb-adjusted ``cgis`` as query string."""
    query_string = __get_cgis_string(__adjust_thumb_cgis(cgis))
    return 'http://hamster.yandex.{}/video-xml?{}'.format(top_level_domain, query_string)


def __get_xml_search_top_doc(top_level_domain, cgis):
    """Return the first ``doc`` element of the xml search response, or None.

    The request url is built by __get_xml_search_url from ``top_level_domain``
    and ``cgis``.  Every attempt is preceded by a short sleep.  A failed
    attempt (request error, non-ok response, unparsable xml, or a response
    without documents) is logged and retried, RETRIES_COUNT attempts in total.

    Returns None when no attempt produced a document.
    """
    RETRIES_COUNT = 2
    SLEEP_TIME_IN_SECONDS = 0.05
    # Only the pass with the unmodified cgis is enabled; a True entry would
    # add a pass with the filtering cgis removed.
    for should_remove_filtering_cgis in [False]:
        if should_remove_filtering_cgis:
            logging.warning('Remove filtering cgis')
            logging.warning('Cgis before = {}'.format(cgis))
            cgis = __remove_filtering_cgis(cgis)
            logging.warning('Cgis after = {}'.format(cgis))
        xml_search_url = __get_xml_search_url(top_level_domain, cgis)
        logging.info('Extracting from {}'.format(xml_search_url))
        for itr in range(RETRIES_COUNT):
            time.sleep(SLEEP_TIME_IN_SECONDS)

            if itr > 0:
                logging.info('Retrying...')

            try:
                xml_search_response = requests.get(xml_search_url, timeout=10.0)
            except Exception as error:
                # Best effort: any request failure is logged and the attempt is retried.
                logging.warning(error)
                continue

            if not xml_search_response.ok:
                logging.warning('Cannot fetch data from xml search')
                continue

            try:
                xml_tree_root = xml.etree.ElementTree.fromstring(xml_search_response.text.encode('utf-8'))
            except xml.etree.ElementTree.ParseError as error:
                # A malformed body counts as a failed attempt instead of aborting the whole run.
                logging.warning('Cannot parse response from xml search: {}'.format(error))
                continue

            # findall always returns a list; an empty one means there is no document to use.
            doc_xml_tree_nodes = xml_tree_root.findall('./response/results/grouping/group/doc')
            if not doc_xml_tree_nodes:
                logging.debug('Incorrect format of response from xml search: expected to have ./response/results/grouping/group/doc')
                continue
            return doc_xml_tree_nodes[0]
    return None


def get_url_without_scheme_legacy(url):
    """Strip a leading 'http:' or 'https:' from ``url`` and return the rest.

    Raises ValueError when ``url`` does not start with one of these schemes.
    """
    scheme_match = re.match(r'^https?:(.*)', url)
    if scheme_match is not None:
        return scheme_match.group(1)
    raise ValueError('Cannot match {} for legacy scheme format'.format(url))


def __enrich_query(query, top_level_domain, shuffle_mode):
    # Fills 'query_links', 'color' and 'thumb_url' (and 'weight' in shuffle mode)
    # of one query in place; returns False when the query has to be dropped.
    if shuffle_mode:
        query['weight'] = query.get('weight', 1)

    # Work on a copy so that the default 'text' cgi is not written back into the stored 'cgi'.
    cgis = dict(query.get('cgi', {}))
    if u'text' not in cgis:
        cgis[u'text'] = query['query']
    # The cgi string is the same for every device, so it is built once.
    cgis_string = __get_cgis_string(cgis)
    query['query_links'] = dict(
        (device, url_path + '?' + cgis_string)
        for device, url_path in __DEVICE_URL_PATHS.items()
    )

    # Same as above: do not modify the stored 'thumb_cgi'.
    thumb_cgis = dict(query.get('thumb_cgi', {}))
    if u'text' not in thumb_cgis:
        thumb_cgis[u'text'] = query['query']

    doc_xml_node = __get_xml_search_top_doc(top_level_domain, thumb_cgis)
    if doc_xml_node is None:
        logging.warning('Cannot fetch data from xml search. Dropping query "{}"'.format(query))
        return False

    color_xml_node = doc_xml_node.find('./video-properties/middle-color')
    if color_xml_node is None:
        logging.warning('Incorrect format of response from xml search: expected to have middle color in doc video props')
    else:
        query['color'] = color_xml_node.text

    thumbnail_xml_node = doc_xml_node.find('./image-properties/thumbnail-link')
    # An element without text has text None, which cannot be matched as a url;
    # treat it like a missing thumbnail link.
    if thumbnail_xml_node is None or thumbnail_xml_node.text is None:
        logging.warning('Incorrect format of response from xml search: expected to have thumbnail link in all docs. Dropping query "{}"'.format(query))
        return False

    query['thumb_url'] = get_url_without_scheme_legacy(thumbnail_xml_node.text)
    return True


def enrich_config(config):
    """Enrich every query of ``config`` in place with links and thumbnail data.

    ``config`` maps a language code to a dict with a 'categories' list and an
    optional 'tld' (default 'ru').  Each category needs 'id' and 'queries';
    with 'mode' equal to 'shuffle' the category and its queries get a default
    'weight' of 1.  Each query needs 'query' unless both 'cgi' and 'thumb_cgi'
    already carry a 'text' entry.

    For each query the function sets 'query_links' (one link per device),
    'color' (when the top document has a middle color) and 'thumb_url'.
    Queries for which no document or no thumbnail link could be fetched are
    removed from their category.

    Raises ValueError when a language has no 'categories', when a category
    lacks 'id' or 'queries', or when a thumbnail link does not start with
    'http:' / 'https:'.
    """
    for lang_code in config:
        lang = config[lang_code]

        if 'categories' not in lang:
            raise ValueError('Incorrect format of config file: expected to have categories for each language')

        top_level_domain = lang.get('tld', 'ru')
        # temporary hack while yandex.ua works through proxy
        if top_level_domain == 'ua':
            top_level_domain = 'ru'

        for category in lang['categories']:
            if ('id' not in category) or ('queries' not in category):
                raise ValueError('Incorrect format of config file: expected to have id and queries for each category of language')

            shuffle_mode = category.get('mode') == 'shuffle'
            if shuffle_mode:
                category['weight'] = category.get('weight', 1)

            queries = category['queries']
            # Slice assignment keeps the same list object while removing the dropped queries.
            queries[:] = [
                query for query in queries
                if __enrich_query(query, top_level_domain, shuffle_mode)
            ]


def __main(args):
    """Load the input config, enrich it in place and write it to the output path."""
    input_path = args.input_config_file_path
    output_path = args.output_config_file_path

    loaded_config = get_config(input_path)
    enrich_config(loaded_config)
    dump_config(loaded_config, output_path)


def __get_args():
    """Parse the command line; both --input_config and --output_config are required."""
    arg_parser = argparse.ArgumentParser(
        description='Enrich video popular config with data from xml search.'
    )
    path_options = (
        ('--input_config', 'input_config_file_path'),
        ('--output_config', 'output_config_file_path'),
    )
    for flag, destination in path_options:
        arg_parser.add_argument(flag, type=str, required=True, dest=destination)
    return arg_parser.parse_args()


# Script entry point: parse the command-line arguments and run the enrichment.
if __name__ == '__main__':
    __main(__get_args())
