#!/usr/bin/env python
# coding=utf-8

import re

import yt.wrapper as yt


class Mapper(object):
    """Map step: emit one record per word removed from each phrase.

    For every ``phrase:freq`` entry of the row's ``norm_freq`` field and for
    every word of that phrase, a record is produced whose ``subph`` is the
    phrase without that word and whose ``word_categs`` is the removed word
    followed by the row's categories (with the atom part stripped).
    """

    def __init__(self):
        # Optional whitespace followed by an underscore: the first match is
        # treated as the start of the atom information inside ``categs``.
        self.search_atoms_pattern = re.compile(r"\s*_")

    def __call__(self, row):
        """Yield ``{'subph', 'word_categs', 'freq'}`` dicts for one input row.

        ``row['norm_freq']`` is parsed as comma-separated ``phrase:freq``
        entries, the phrase being space-separated words; ``row['categs']`` is
        a string. Phrases with fewer than 2 or more than 6 words are skipped.

        Raises ``ValueError`` if an entry does not split into exactly two
        parts on ``:`` or if the frequency of an emitted phrase is not an
        integer.
        """
        norm_freq = row['norm_freq']
        categs = row['categs']

        # Drop the atom information. When the marker is absent there is
        # nothing to strip, so the categories are kept whole instead of
        # failing on a ``None`` match.
        atoms_match = self.search_atoms_pattern.search(categs)
        if atoms_match is not None:
            categs = categs[:atoms_match.start()]

        for entry in norm_freq.split(','):
            phr, freq = entry.split(':')
            words = phr.split(' ')

            count_word = len(words)
            # Skip single-word phrases and phrases longer than six words.
            if count_word < 2 or count_word > 6:
                continue

            freq = int(freq)
            for i, word in enumerate(words):
                yield {
                    # The phrase with the i-th word removed.
                    'subph': ' '.join(words[:i] + words[i + 1:]),
                    'word_categs': word + " " + categs,
                    'freq': freq
                }


class Reducer(object):
    """Reduce step: aggregate the removed-word frequencies of one ``subph``.

    :param max_query_tails: maximum number of distinct ``word_categs``
        entries kept in the output string (the most frequent ones are kept).
        Defaults to 2000; ``None`` keeps all of them.
    """

    def __init__(self, max_query_tails=2000):
        self.max_query_tails = max_query_tails

    def __call__(self, key, recs):
        """Yield a single summary row for the group identified by ``key``.

        The row contains ``subph`` (taken from ``key``), ``freq_full`` (the
        sum of ``freq`` over all records of the group) and
        ``freq_word_categs``: ``"<word_categs> <freq>"`` entries joined with
        ``" , "``, with runs of whitespace collapsed to a single space.
        """
        full_freq = 0
        norm_freq = {}

        for rec in recs:
            freq = rec['freq']
            norm = rec['word_categs']
            full_freq += freq
            norm_freq[norm] = norm_freq.get(norm, 0) + freq

        # Order the tails by frequency, highest first; equal frequencies are
        # ordered by the tail string, also descending. A single sort on the
        # (freq, tail) pair gives the same order as sorting by tail and then
        # stably by frequency.
        ranked = sorted(norm_freq.items(),
                        key=lambda item: (item[1], item[0]),
                        reverse=True)
        ranked = ranked[:self.max_query_tails]

        freq_word_categs = ' , '.join(
            [_norm + ' ' + str(_freq) for _norm, _freq in ranked])
        # Collapse any run of two or more whitespace characters.
        freq_word_categs = re.sub(r'\s{2,}', r' ', freq_word_categs)

        yield {'subph': key['subph'],
               'freq_full': full_freq,
               'freq_word_categs': freq_word_categs}


def generate_tails(yt_client, in_syns_categs_table, dst_tails_table):
    """Build the tails table from ``in_syns_categs_table``.

    Runs a map-reduce with ``Mapper`` and ``Reducer`` grouped by ``subph``,
    writing to ``dst_tails_table``, and then sorts that table by ``subph``.
    """
    mapper = Mapper()
    reducer = Reducer()

    yt_client.run_map_reduce(
        mapper,
        reducer,
        in_syns_categs_table,
        dst_tails_table,
        reduce_by=['subph'],
    )

    # Sort the freshly written table by the same key it was reduced by.
    yt_client.run_sort(dst_tails_table, sort_by=['subph'])


def test_generated_tails(yt_client, old_table, new_table, out_table):
    """Compare two tails tables and write detected mismatches to ``out_table``.

    Both tables are sorted by ``subph`` (via ``run_sort`` on the table itself)
    and then reduced together by ``subph`` with ``new_table`` listed first.
    For every key where a discrepancy is found one row
    ``{'error_code': <int>, 'subph': <key>}`` is emitted:

    * 1 - the first record of the group has ``@table_index`` other than 0;
    * 2 - a second record with ``@table_index`` 0 appears in the group;
    * 5 - a later record contains a word missing from the first record;
    * 6 - after subtracting the later records' frequencies from the first
      record's, some word is left with a non-zero frequency.

    Groups whose first record has more than 1998 entries are not checked.
    """
    def reduce(key, recs):
        recs = iter(recs)
        first = next(recs)
        # presumably table index 0 is new_table, since it is listed first in
        # the reduce input below - confirm against the YT documentation.
        if first['@table_index'] != 0:
            yield {'error_code': 1, "subph": first['subph']}
            return

        # ``freq_word_categs`` is parsed as comma-separated ``word:freq``
        # entries.
        # NOTE(review): Reducer in this file joins entries with ' , ' and
        # separates the frequency with a space, not ':' - confirm that the
        # tables passed here really use the colon format.
        words = first['freq_word_categs'].split(',')
        word_freq = [word.split(':') for word in words]

        word_freq_dict = dict()
        for word, freq in word_freq:
            word_freq_dict[word] = word_freq_dict.get(word, 0) + int(freq)

        # Very long lists are skipped without any verdict.
        # NOTE(review): 1998 looks related to the 2000-entry cap applied by
        # Reducer - confirm.
        if len(words) > 1998:
            return

        for rec in recs:
            if rec['@table_index'] == 0:
                yield {'error_code': 2, "subph": first['subph']}
                return

            words = rec['freq_word_categs'].split(',')
            word_freq = [word.split(':') for word in words]

            for word, freq in word_freq:
                if word not in word_freq_dict:
                    yield {'error_code': 5, "subph": first['subph']}
                    return
                word_freq_dict[word] -= int(freq)

        # Every word must cancel out on its own. Comparing only the sum of
        # the residuals would miss opposite-sign mismatches that add up to
        # zero (e.g. +2 for one word and -2 for another).
        if any(residual != 0 for residual in word_freq_dict.values()):
            yield {'error_code': 6, "subph": first['subph']}

    yt_client.run_sort(new_table, sort_by=['subph'])

    yt_client.run_sort(old_table, sort_by=['subph'])

    yt_client.run_reduce(reduce,
                         [new_table, old_table],
                         out_table,
                         reduce_by=['subph'],
                         # Exposes control attributes such as '@table_index'
                         # as ordinary fields of each row.
                         format=yt.YsonFormat(control_attributes_mode="row_fields"),
                         )
