# -*- coding: utf-8 -*-
import re

import sys
sys.path.append('../lib')
sys.path.append('../pylib')

import yt.wrapper as yt

class Mapper(object):
    """Map a banner row to the word windows that surround negation words.

    The title and body of the row are joined, lower-cased and split into
    words on runs of non-word characters.  For every occurrence of a
    negation word, each contiguous window of at least two words that
    contains the occurrence and extends at most ``MAX_WORDS_BEFORE`` words
    to the left and ``MAX_WORDS_AFTER`` words to the right is emitted as
    ``{'phrase': <space-joined words>, 'uid': row['uid']}``.

    Rows whose ``title`` or ``body`` is missing or ``None`` produce nothing.
    Windows reachable from several negation occurrences are emitted once
    per occurrence (the emission order matches the nesting of the loops:
    occurrence, then left edge moving outwards, then right edge moving
    outwards).
    """

    # Negation markers, kept as text (unicode) so that they compare equal
    # to the decoded words; a frozenset gives O(1) membership tests and is
    # built once instead of once per row.
    NEGATIONS = frozenset([
        u'не', u'без', u'никакой', u'никакая', u'никакое', u'никакие',
        u'никакого', u'никаких', u'никакому', u'никаким', u'нет',
        u'нельзя', u'невозможно',
    ])

    # Runs of non-word characters are treated as a single word separator.
    NON_WORD_RE = re.compile(r'(?u)\W+')

    # How far a window may extend on each side of the negation word.
    MAX_WORDS_BEFORE = 5
    MAX_WORDS_AFTER = 5

    @staticmethod
    def _to_text(value):
        """Return *value* decoded from UTF-8 if it is a byte string."""
        # Decoding an already-decoded string would fail (or trigger an
        # implicit ASCII encode), so only byte strings are decoded.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def __call__(self, row):
        """Yield one ``{'phrase', 'uid'}`` dict per window found in *row*.

        *row* is a mapping expected to provide ``title``, ``body`` and
        ``uid``.  A missing ``uid`` raises ``KeyError`` as soon as the first
        window is produced.
        """
        title = row.get('title')
        body = row.get('body')
        if title is None or body is None:
            return

        text = self._to_text(title) + u' ' + self._to_text(body)
        # split() without arguments drops the empty tokens that leading or
        # trailing separators would otherwise create, so phrases never
        # carry stray spaces and empty strings are not counted as words.
        words = self.NON_WORD_RE.sub(u' ', text.lower()).split()
        word_count = len(words)

        for pos, word in enumerate(words):
            if word not in self.NEGATIONS:
                continue
            leftmost = max(0, pos - self.MAX_WORDS_BEFORE)
            rightmost = min(word_count, pos + 1 + self.MAX_WORDS_AFTER)
            # The left edge walks outwards from the negation word; *right*
            # is an exclusive end index walking outwards as well.
            for left in range(pos, leftmost - 1, -1):
                for right in range(pos + 1, rightmost + 1):
                    # Skip the window made of the negation word alone.
                    if right - left > 1:
                        yield {
                            'phrase': u' '.join(words[left:right]),
                            'uid': row['uid'],
                        }

class Reducer():
    """Count the distinct uids seen for one phrase.

    Called with the reduce key (a mapping holding ``phrase``) and the rows
    grouped under it.  Emits a single ``{'phrase', 'uids_count'}`` row when
    the phrase was seen with more than one distinct ``uid``; phrases tied to
    a single uid produce no output.
    """

    def __call__(self, key, rows):
        distinct_uids = {record["uid"] for record in rows}
        uid_total = len(distinct_uids)
        # A phrase seen with only one uid is dropped.
        if uid_total <= 1:
            return
        yield {
            'phrase': key['phrase'],
            'uids_count': uid_total,
        }

def main():
    """Configure the YT client, run the negation map-reduce and sort the result.

    The job reads the banners table, applies ``Mapper`` and ``Reducer``
    grouped by ``phrase``, writes ``phrase``/``uids_count`` rows to
    ``//home/catalogia/tmp/negations`` and then sorts that table by
    ``uids_count``.
    """
    # Client settings: sandbox mounting, credentials, scheduling pool and
    # the cluster proxy to talk to.
    yt.config['mount_sandbox_in_tmpfs'] = True
    yt.config['token_path'] = '/opt/broadmatching/bm-tokens/yt_plato'
    yt.config['spec_defaults'] = dict(pool='catalogia')
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"

    # Alternative input table left over from an earlier run of this script:
    # //home/catalogia/tmp/categories_bids_extended718507
    source_table = '//home/direct/export/bm/bm_banners'

    # The output path carries an inline non-strict schema declaring the two
    # columns produced by the reducer.
    destination = '<schema = <strict=%false>[{name = phrase; type = string };{name = uids_count; type = int64 }]>//home/catalogia/tmp/negations'

    yt.run_map_reduce(
        Mapper(),
        Reducer(),
        source_table,
        destination,
        reduce_by=['phrase'],
    )

    yt.run_sort('//home/catalogia/tmp/negations', sort_by=['uids_count'])

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
