#!/usr/bin/env python
# coding=utf-8


def generate_snorms_and_syns(bmyt_cl, src_count_table, dst_snorm_table, dst_syns_table, lang):
    """Compute snorms for every norm and build the snorm/synonym tables.

    The work is done in three steps inside one YT transaction:

    1. A Perl mapper (run through ``bmyt_cl.run_bm_map``) reads
       ``src_count_table``, skips rows whose ``norm`` ends with ``~0`` and
       adds an ``snorm`` field to the remaining rows; the result goes to a
       temporary table.
    2. A YQL query overwrites ``dst_snorm_table`` with the ``(norm, snorm)``
       pairs from the temporary table where the two values differ.
    3. A map-reduce over the temporary table groups rows by ``snorm`` and
       writes one row per snorm to ``dst_syns_table``; that table is then
       sorted by ``snorm``.

    Args:
        bmyt_cl: client object providing ``yt_client``, ``run_bm_map`` and
            ``do_yql``.
        src_count_table: input table; rows are read by their ``norm`` and
            ``freq`` fields.
        dst_snorm_table: output table for ``norm``/``snorm`` pairs
            (truncated before writing).
        dst_syns_table: output table with ``snorm``, ``full_freq`` (sum of
            ``freq`` over the group) and ``norm_freq`` (comma-separated
            ``norm:freq`` items ordered by ``norm``).
        lang: language name substituted into the Perl mapper; an empty value
            (``''`` or ``None``) selects the project's default language.
    """
    with bmyt_cl.yt_client.Transaction() as tx, \
            bmyt_cl.yt_client.TempTable() as tmp_table:
        bm_mapper = {
            # NOTE(review): lang is pasted into a single-quoted Perl literal
            # without escaping, so it must not contain a quote character.
            'begin': '''
                my $lang = '%s' // '';
                use BaseProject;
                my $proj = BaseProject->new({
                    load_languages => [ qw(ru en tr) ],
                    load_dicts => 1,
                    load_minicategs_light => 1,
                    allow_lazy_dicts => 1,
                    use_comptrie_subphraser => 1,
                    use_sandbox_categories_suppression_dict => 1,
                });

                $proj->categs_tree->never_read_categs_cache(1);
                $proj->categs_tree->never_write_categs_cache(1);

                $self->{proj} = $proj;
                $self->{language} = $lang ? $proj->get_language($lang) : $proj->default_language;
            ''' % (lang or ''),  # None would otherwise be rendered as the string 'None'

            'mapper': '''
                my $text = $r->{'norm'};
                next if $text =~ /~0$/;

                my $phr = $self->{language}->phrase($text);
                $r->{'snorm'} = $phr->snorm_phr;

                yield($r => YT_TMP_TABLE);
            ''',
            'dst_names': ['YT_TMP_TABLE'],
            'dst_fields': [{'snorm': str,
                            'norm': str,
                            'freq': int}]
        }

        bmyt_cl.run_bm_map(
            bm_mapper,
            src_count_table,
            tmp_table,
        )

        yql_query = '''
        INSERT INTO `{output_table}` WITH TRUNCATE
        SELECT
            norm,
            snorm
        FROM `{input_table}`
        WHERE
            norm != snorm
        '''.format(input_table=tmp_table,
                   output_table=dst_snorm_table)

        # The transaction id is passed explicitly to the YQL call.
        bmyt_cl.do_yql(yql_query, title='filter snorms', transaction_id=tx.transaction_id)

        def reducer(key, recs):
            """Collapse all rows sharing one snorm into a single summary row."""
            full_freq = 0
            norm_freq = {}

            for rec in recs:
                freq = rec['freq']
                full_freq += freq
                norm = rec['norm']
                norm_freq[norm] = norm_freq.get(norm, 0) + freq

            # sorted() works on both Python 2 and 3; dict.keys().sort() fails
            # on Python 3 because keys() returns a view without sort().
            sorted_norms = sorted(norm_freq)

            yield {'snorm': key['snorm'],
                   'full_freq': full_freq,
                   'norm_freq': ','.join([_norm + ':' + str(norm_freq[_norm]) for _norm in sorted_norms])}

        bmyt_cl.yt_client.run_map_reduce(
            None,
            reducer,
            tmp_table,
            dst_syns_table,
            reduce_by=['snorm']
        )

        bmyt_cl.yt_client.run_sort(dst_syns_table, sort_by=['snorm'])
