#! /usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import codecs
import argparse
import re
import json
import urllib
from collections import namedtuple


# Per-locale settings:
#   gzt_code  - numeric language code written as `lang = <code>` in the gazetteer
#   gzt_name  - language name written as `lang=<NAME>` inside every key
#   in_prefix - word prepended (followed by a space) to every generated name
#   geo_name  - name of the geoexport field holding the region name for the locale
LocaleInfo = namedtuple(
    'LocaleInfo',
    ['gzt_code', 'gzt_name', 'in_prefix', 'geo_name']
)

# Supported locales, keyed by the locale code used in the input JSON files.
LOCALES = {
    'ru': LocaleInfo(gzt_code=1, gzt_name='RUS', in_prefix=u'в', geo_name='name'),
    'en': LocaleInfo(gzt_code=2, gzt_name='ENG', in_prefix='in', geo_name='Enname'),
    'tr': LocaleInfo(gzt_code=44, gzt_name='TUR', in_prefix=u'içinde', geo_name='Trname'),
}


# Rubric ids that mark a company as interesting: only companies having at
# least one of these rubrics are kept, and only these rubrics contribute
# descriptor prefixes/suffixes when names are composed.
INTERESTING_RUBRIC_IDS = {
    31313,  # shopping mall
    30764,  # entertainment center
    31433,  # railway station
    31298   # airport
}


def handle_value(value):
    """Normalize a name: lower-case it, then drop quotes, parentheses and commas."""
    lowered = value.lower()
    # Removed characters: double quote, single quote, '(', ')' and ','.
    stripped = re.compile('[\"\'(),]').sub('', lowered)
    return stripped


def is_reg_expr(expr):
    """Return True when ``expr`` contains '[', ']' or '+'.

    Those three characters are the only markers checked; other regular
    expression metacharacters (e.g. '.', '*') do not make a value count as
    a regular expression here.
    """
    # Raw string: the backslashes belong to the regex, not to Python escapes.
    return re.search(r'[\[\]\+]', expr) is not None


def create_names_dict(names=(), contains_set=False):
    """Group ``(locale, value)`` pairs by locale.

    Args:
        names: iterable of ``(locale, value)`` pairs; pairs whose locale is
            not a key of ``LOCALES`` are ignored.
        contains_set: when True every locale maps to a ``set`` (duplicates
            collapse), otherwise to a ``list`` (input order and duplicates kept).

    Returns:
        dict with one entry per key of ``LOCALES``; locales without any
        values map to an empty container.
    """
    # An immutable default avoids sharing one list object between calls.
    names_dict = dict()
    for locale in LOCALES:
        names_dict[locale] = set() if contains_set else []
    for locale, value in names:
        if locale not in LOCALES:
            continue
        if contains_set:
            names_dict[locale].add(value)
        else:
            names_dict[locale].append(value)
    return names_dict


def create_geo_dicts():
    """Download the region list from geoexport and index it.

    Requests the fields ``id``, ``Type``, ``Parent`` plus the name field of
    every locale in ``LOCALES``.

    Returns:
        ``(geo_dict, geo_names_dict)`` where ``geo_dict`` maps a region id to
        ``(int(Type), Parent)`` and ``geo_names_dict`` maps every locale to
        the set of normalized (see ``handle_value``) region names.
    """
    fields = ['id', 'Type', 'Parent']
    fields.extend(locale_info.geo_name for locale_info in LOCALES.values())
    url = (
        'http://geoexport.yandex.ru/?format=json&fields=' + ','.join(fields) +
        '&types=6,2,3,4,5,7,10'
    )
    response = urllib.urlopen(url)
    try:
        geo_list = json.loads(response.read())
    finally:
        # Release the connection even when reading or parsing fails.
        response.close()

    geo_dict = dict()
    geo_names_dict = create_names_dict(contains_set=True)
    for geo in geo_list:
        geo_dict[geo['id']] = (int(geo['Type']), geo['Parent'])
        for locale, locale_info in LOCALES.items():
            geo_names_dict[locale].add(handle_value(geo[locale_info.geo_name]))
    return geo_dict, geo_names_dict


def create_companies_dict(path, geo_dict):
    """Load the interesting companies from a JSON-lines file.

    A company is kept when it has at least one rubric from
    ``INTERESTING_RUBRIC_IDS``, its address carries a ``geo_id`` and that id
    resolves to a "major" region through ``geo_dict``.

    Args:
        path: file with one JSON object per line; the fields read are
            ``rubrics``, ``names``, ``address`` and ``permalink``.
        geo_dict: mapping ``geo id -> (type, parent id)`` as built by
            ``create_geo_dicts``. Lookups are made with ``str(geo_id)``.

    Returns:
        dict ``permalink -> (names_dict, rubric_ids, major_geo_id)`` where
        ``names_dict`` holds per-locale lists of normalized 'main'/'short'
        names and ``rubric_ids`` is a list.
    """

    def is_interesting_company(comp):
        return any(r['rubric_id'] in INTERESTING_RUBRIC_IDS for r in comp['rubrics'])

    def get_major_geo_id(geo_id):
        # Walk up the parent chain until a region with type <= 5 is found;
        # give up (None) as soon as an id is missing from geo_dict.
        cur_geo_id = geo_id
        cur_geo_info = geo_dict.get(cur_geo_id)
        while cur_geo_info is not None:
            geo_type, geo_parent_id = cur_geo_info
            if geo_type <= 5:
                return cur_geo_id
            cur_geo_id = geo_parent_id
            cur_geo_info = geo_dict.get(cur_geo_id)
        return None

    companies_dict = dict()
    with open(path, 'r') as company_file:
        for line in company_file:
            row = json.loads(line)
            if not is_interesting_company(row):
                continue
            names = []
            # A real list: it is iterated several times by create_features.
            rubric_ids = [r['rubric_id'] for r in row['rubrics']]
            for name_info_d in row['names']:
                name_type = name_info_d['type']
                name_d = name_info_d['value']
                locale = name_d['locale']
                value = handle_value(name_d['value'])
                if locale in LOCALES and name_type in ('main', 'short'):
                    names.append((locale, value))
            raw_geo_id = row['address'].get('geo_id')
            if raw_geo_id is None:
                continue
            major_geo_id = get_major_geo_id(str(raw_geo_id))
            if major_geo_id is None:
                continue
            companies_dict[row['permalink']] = (create_names_dict(names), rubric_ids, major_geo_id)
    return companies_dict


def create_desc_dict(path, attr_names):
    """Load rubric descriptors from a JSON-lines file.

    Args:
        path: file with one JSON object per line; every object has an ``id``
            and, for each name in ``attr_names``, a list of
            ``{'locale': ..., 'value': ...}`` entries.
        attr_names: names of the row attributes to collect descriptors from.

    Returns:
        dict ``row id -> names_dict`` where ``names_dict`` maps every locale
        of ``LOCALES`` to a list of normalized descriptors. Each list is
        ordered with regular-expression descriptors first, then plain ones
        from longest to shortest.
    """

    def cmp_key(key):
        # Regular expressions sort before everything else; plain strings
        # follow by decreasing length.
        if is_reg_expr(key):
            return -sys.maxsize
        return -len(key)

    desc_dict = dict()
    with open(path, 'r') as desc_file:
        for line in desc_file:
            row = json.loads(line)
            names = []
            for attr_name in attr_names:
                for name_d in row[attr_name]:
                    locale = name_d['locale']
                    value = handle_value(name_d['value'])
                    if locale in LOCALES:
                        names.append((locale, value))
            desc_dict[row['id']] = create_names_dict(names)
    for names_dict in desc_dict.values():
        for locale in LOCALES:
            names_dict[locale].sort(key=cmp_key)
    return desc_dict


def create_to_tail_dict(path):
    """Read the ``to_tail`` flag of every row of a JSON-lines file.

    Args:
        path: file with one JSON object per line, each having ``id`` and
            ``to_tail`` fields.

    Returns:
        dict ``id -> to_tail``; for a repeated id the last row wins.
    """
    to_tail_dict = dict()
    # The context manager closes the file even when a row fails to parse.
    with open(path, 'r') as desc_file:
        for line in desc_file:
            row = json.loads(line)
            to_tail_dict[row['id']] = row['to_tail']
    return to_tail_dict


def create_fix_dicts(path, companies_dict, geo_names_dict):
    """Build the manual correction dictionaries.

    Args:
        path: JSON-lines file with rows having ``action``, ``locale``,
            ``value`` and, for per-company actions, ``permalink``. An empty
            path or ``None`` means "no fix list".
        companies_dict: known companies keyed by permalink; rows for other
            permalinks are ignored.
        geo_names_dict: per-locale sets of names. NOTE: this object is used
            as the ``remove_all`` dictionary itself, so 'remove_all' rows are
            added to the caller's sets in place.

    Returns:
        ``(add_fix_dict, remove_fix_dict, remove_all_fix_dict)``; the first
        two map ``permalink -> locale -> set of values``, the third maps
        ``locale -> set of values``.
    """
    remove_all_fix_dict = geo_names_dict
    add_fix_dict = dict()
    remove_fix_dict = dict()
    for permalink in companies_dict:
        add_fix_dict[permalink] = create_names_dict(contains_set=True)
        remove_fix_dict[permalink] = create_names_dict(contains_set=True)
    # Covers both '' and None (argparse yields None for an omitted option).
    if not path:
        return add_fix_dict, remove_fix_dict, remove_all_fix_dict
    with open(path, 'r') as fix_file:
        for line in fix_file:
            row = json.loads(line)
            action = row['action']
            locale = row['locale']
            value = row['value']
            if locale not in LOCALES:
                continue
            if action == 'remove_all':
                remove_all_fix_dict[locale].add(value)
                continue
            permalink = row['permalink']
            if permalink not in companies_dict:
                continue
            if action == 'add':
                add_fix_dict[permalink][locale].add(value)
            elif action == 'remove':
                remove_fix_dict[permalink][locale].add(value)
    return add_fix_dict, remove_fix_dict, remove_all_fix_dict


def create_features(altay_desc_dict, main_desc_dict, help_desc_dict, to_tail_desc_dict, companies_dict, fix_dicts):
    """Generate the name variants of every company for every locale.

    Args:
        altay_desc_dict, main_desc_dict, help_desc_dict: descriptor
            dictionaries ``rubric id -> locale -> list of descriptors``;
            descriptors are used as regular expression fragments.
        to_tail_desc_dict: ``rubric id -> flag``; when the flag is True a
            descriptor may also be appended after the name.
        companies_dict: ``permalink -> (names_dict, rubric_ids, geo_id)``.
        fix_dicts: tuple ``(add_fix_dict, remove_fix_dict, remove_all_fix_dict)``
            of manual corrections.

    Returns:
        list of ``(permalink, gzt_code, names, geo_id)`` tuples, one per
        company and locale that has at least one value left; ``names`` holds
        ``(gzt_name, in_prefix + ' ' + value)`` pairs.
    """

    def handle_with_fix(permalink, locale, compoused_values):
        # Drop values listed for removal (per company or globally), then add
        # the manually supplied ones. Going through a set removes duplicates,
        # so the order of the returned list is not defined.
        add_fix_dict, remove_fix_dict, remove_all_fix_dict = fix_dicts
        values_set = set(filter(
            lambda value: value not in remove_fix_dict[permalink][locale] and value not in remove_all_fix_dict[locale],
            compoused_values
        ))
        for value in add_fix_dict[permalink][locale]:
            values_set.add(value)
        return list(values_set)

    def remove_descriptors(locale, cur_value, rubric_ids, cur_desc_dict):
        # Cut every descriptor of the company's rubrics out of cur_value.
        # For non-English locales the English descriptors are removed too.

        def remove_descriptor(desc, value):
            # desc is inserted into the pattern unescaped; it only matches
            # when delimited by whitespace or the string boundaries, and the
            # whole match is replaced by a single space.
            return re.sub('(^|\s)' + desc + '(\s|$)', ' ', value)

        for rubric_id in filter(lambda r_id: r_id in cur_desc_dict, rubric_ids):
            for rubric_value in cur_desc_dict[rubric_id][locale]:
                cur_value = remove_descriptor(rubric_value, cur_value)
            if locale != 'en':
                for rubric_value in cur_desc_dict[rubric_id]['en']:
                    cur_value = remove_descriptor(rubric_value, cur_value)
        return cur_value.strip(' ')

    def extract_all_values(locale, value, rubric_ids):
        # Returns (value, sub_value, abs_value): the name with main
        # descriptors removed, then additionally with altay descriptors
        # removed, then additionally with help descriptors removed.
        # Three empty strings mean "skip this name".

        # A non-Russian name containing lowercase Cyrillic letters is skipped.
        if locale != 'ru' and re.search(ur'[а-я]', value):
            return ('', '', '')
        value = remove_descriptors(locale, value, rubric_ids, main_desc_dict)
        sub_value = value
        sub_value = remove_descriptors(locale, sub_value, rubric_ids, altay_desc_dict)
        abs_value = sub_value
        abs_value = remove_descriptors(locale, abs_value, rubric_ids, help_desc_dict)
        # Nothing but non-word characters is left once all descriptors are removed.
        if not abs_value.isdigit() and re.search(r'[\w]', abs_value, flags=re.U) is None:
            return ('', '', '')
        return (value, sub_value, abs_value)

    def compouse_values(locale, cur_value, rubric_ids):
        # Build the variants of cur_value: the bare value, "<descriptor> <value>"
        # and, when the rubric's to_tail flag is True, "<value> <descriptor>".
        # Only plain (non-regex) main descriptors of interesting rubrics are used.
        if len(cur_value) == 0:
            return []
        compoused_values = {cur_value}
        for rubric_id in filter(lambda r_id: r_id in main_desc_dict and r_id in INTERESTING_RUBRIC_IDS, rubric_ids):
            for rubric_value in main_desc_dict[rubric_id][locale]:
                if not is_reg_expr(rubric_value):
                    compoused_values.add(rubric_value + ' ' + cur_value)
                    if to_tail_desc_dict[rubric_id] is True:
                        compoused_values.add(cur_value + ' ' + rubric_value)
        return list(compoused_values)

    features = []
    for permalink, company in companies_dict.items():
        names_dict, rubric_ids, geo_id = company
        for locale, locale_info in LOCALES.items():
            locale_values = names_dict[locale]
            compoused_values = []
            for value in locale_values:
                # NOTE: `value` is rebound here to the name without main descriptors.
                value, sub_value, abs_value = extract_all_values(locale, value, rubric_ids)
                compoused_values.extend(compouse_values(locale, abs_value, rubric_ids))
                # The intermediate forms are added only when they differ from
                # the next, more reduced form.
                if sub_value != abs_value:
                    compoused_values.extend(compouse_values(locale, sub_value, rubric_ids))
                if value != sub_value:
                    compoused_values.append(value)
            values = handle_with_fix(permalink, locale, compoused_values)
            if len(values) > 0:
                # Every value is prefixed with the locale's in_prefix and paired
                # with the locale's gzt_name.
                features.append((permalink, locale_info.gzt_code, map(lambda value: (locale_info.gzt_name, locale_info.in_prefix + ' ' + value), values), geo_id))
    return features


def write_gzt(features, output):
    """Write features as ``TBusinessFeature`` gazetteer articles.

    Args:
        features: iterable of ``(permalink, gzt_code, names, geo_id)`` where
            ``names`` is an iterable of ``(gzt_name, value)`` pairs.
        output: object with a ``write`` method accepting text.

    Every article is followed by an empty line. Inside a key every non-word
    character of the value is replaced by a space and runs of whitespace are
    collapsed.
    """

    def remove_symbols(value):
        return re.sub(r'[\W]', ' ', value, flags=re.U)

    def handle_spaces(value):
        return re.sub(r'\s+', ' ', value).strip()

    def gzd_name_format(name_info):
        locale_gzt_name, value = name_info
        return '{"' + handle_spaces(remove_symbols(value)) + '" lang=' + locale_gzt_name + ' morph=ALL_FORMS}'

    for permalink, locale_gzt_code, names_info_set, geo_id in features:
        article = [
            'TBusinessFeature {',
            '        key = ' + ' | '.join(map(gzd_name_format, names_info_set)),
            '        id = "located_at"',
            '        value = "' + str(permalink) + '"',
            '        lang = ' + str(locale_gzt_code),
            '        geoid = ' + str(geo_id),
            '}',
            '',
        ]
        # write() instead of the `print >>` chevron: same text, but not tied
        # to the Python 2 print statement.
        output.write('\n'.join(article) + '\n')


def build_gzt(company='company.json', rubric='rubric.json', rubric_desc='rubric_desc.json', features_gzt='inner_org_features.gzt', fix_list='fix_list.json'):
    """Build the gazetteer file of company names.

    Args:
        company: JSON-lines file with companies.
        rubric: JSON-lines file with rubrics (its ``names``, ``short_names``
            and ``phrases`` attributes are used as descriptors).
        rubric_desc: JSON-lines file with ``main_descriptors``,
            ``help_descriptors`` and ``to_tail`` per rubric.
        features_gzt: path of the UTF-8 output file.
        fix_list: JSON-lines file with manual corrections.
    """
    geo_dict, geo_names_dict = create_geo_dicts()
    companies_dict = create_companies_dict(company, geo_dict)
    altay_desc_dict = create_desc_dict(rubric, ['names', 'short_names', 'phrases'])
    main_desc_dict = create_desc_dict(rubric_desc, ['main_descriptors'])
    help_desc_dict = create_desc_dict(rubric_desc, ['help_descriptors'])
    to_tail_desc_dict = create_to_tail_dict(rubric_desc)
    fix_dicts = create_fix_dicts(fix_list, companies_dict, geo_names_dict)
    features = create_features(
        altay_desc_dict,
        main_desc_dict,
        help_desc_dict,
        to_tail_desc_dict,
        companies_dict,
        fix_dicts
    )
    # The output is opened only after all inputs were processed; the context
    # manager guarantees it is flushed and closed.
    with codecs.open(features_gzt, 'w', 'utf-8') as output:
        write_gzt(features, output)


def build_rubric_txt(rubric='rubric.json', rubric_txt='rubric_inner_org.txt'):
    """Write ``<permalink>\\tlocated_at;`` for every published ordinal rubric.

    Args:
        rubric: JSON-lines file; the fields read are ``publishing_status``,
            ``type`` and ``permalink``.
        rubric_txt: path of the UTF-8 output file.

    Rows whose ``publishing_status`` is not 'publish' or whose ``type`` is
    not 'ordinal' are skipped.
    """
    # Both files are closed on exit, also when a row fails to parse.
    with codecs.open(rubric_txt, 'w', 'utf-8') as output, open(rubric, 'r') as rubric_file:
        for line in rubric_file:
            row = json.loads(line)
            if row['publishing_status'] != 'publish' or row['type'] != 'ordinal':
                continue
            output.write(row['permalink'] + '\tlocated_at;' + '\n')


if __name__ == '__main__':
    # Command-line entry point: every option corresponds to the build_gzt
    # parameter of the same name.
    parser = argparse.ArgumentParser()
    parser.add_argument('--company')
    parser.add_argument('--rubric')
    parser.add_argument('--rubric_desc')
    parser.add_argument('--features_gzt')
    parser.add_argument('--fix_list')
    args = parser.parse_args()
    # argparse stores None for omitted options; forwarding those would
    # override build_gzt's defaults and fail later in open()/len(). Pass only
    # what was actually given on the command line.
    given = dict((name, value) for name, value in vars(args).items() if value is not None)
    build_gzt(**given)
