#!/usr/bin/python
import re

import sys
import os
import shutil
import datetime
from tempfile import mkstemp
import hashlib

from bm.yt_tools import NormalizeMapper
import yt.wrapper as yt

class Mapper(NormalizeMapper):
    """Map an input row to a fingerprint of its normalized phrase set.

    For every input row the mapper emits one record with the row's ``uid``,
    the number of phrases and an MD5 digest computed over the sorted,
    normalized phrases.  Rows whose phrase sets normalize to the same list
    therefore share the same ``(md5, count)`` pair.
    """

    def __init__(self):
        # NOTE(review): ``local_files=True`` is interpreted by NormalizeMapper,
        # which is defined outside this file -- confirm its exact meaning there.
        super(Mapper, self).__init__(local_files=True)

    def __call__(self, row):
        """Yield a single ``{'uid', 'count', 'md5'}`` record for ``row``.

        ``row`` must provide ``'uid'`` and ``'phrases'``; the latter is a
        comma-separated string of phrases.
        """
        raw_phrases = row['phrases'].split(',')
        # Drop everything from the first ':' (together with any spaces and an
        # optional '~0' marker right before it) up to the end of the phrase,
        # then normalize.  Sorting makes the digest independent of the order
        # in which the phrases were listed.
        normalized = sorted(
            self.norm_phr(re.sub(r' *(~0)?:.*$', '', phrase))
            for phrase in raw_phrases
        )
        digest = hashlib.md5()
        for phrase in normalized:
            # hashlib works on bytes; encode text explicitly so non-ASCII
            # phrases do not fail inside update().
            if not isinstance(phrase, bytes):
                phrase = phrase.encode('utf-8')
            digest.update(phrase)
            # Terminate every phrase with a separator so that different phrase
            # lists with the same concatenation (['ab', 'c'] vs ['a', 'bc'])
            # do not produce the same digest.
            # Assumes NUL never occurs inside a normalized phrase -- TODO confirm.
            digest.update(b'\0')
        yield {
            'uid': row['uid'],
            'count': len(normalized),
            'md5': digest.digest(),
        }

class Reducer:
    """Report every uid that shares its reduce key with at least one other uid."""

    def __call__(self, key, rows):
        """Yield ``{'uid', 'phrcount'}`` for each distinct uid in ``rows``.

        Nothing is yielded when the group contains fewer than two distinct
        uids.  ``phrcount`` is taken from ``key['count']``.
        """
        # A dict (not a set) keeps the iteration order of the distinct uids
        # exactly as a plain dict built by successive assignment would.
        distinct_uids = dict.fromkeys(record['uid'] for record in rows)
        if len(distinct_uids) <= 1:
            return
        phrase_count = key['count']
        for uid in distinct_uids:
            yield {
                'uid': uid,
                'phrcount': phrase_count,
            }

def main():
    """Configure the YT client and run the map-reduce operation.

    Reads ``//home/direct/export/bm/bm_banners``, maps rows with ``Mapper``,
    groups them by ``md5`` and ``count`` and writes the ``Reducer`` output to
    ``//home/catalogia/tmp/count_doubles``.
    """
    source_table = '//home/direct/export/bm/bm_banners'
    result_table = '//home/catalogia/tmp/count_doubles'

    # Top-level client options, applied in a fixed order.
    client_options = (
        ('mount_sandbox_in_tmpfs', True),
        ('token_path', '/opt/broadmatching/bm-tokens/yt_plato'),
        ('spec_defaults', {'pool': 'catalogia'}),
    )
    for option, value in client_options:
        yt.config[option] = value
    yt.config['proxy']['url'] = 'hahn.yt.yandex.net'

    map_stage = Mapper()
    reduce_stage = Reducer()
    # A separate Mapper instance is created just to read its ``files``
    # attribute (presumably provided by NormalizeMapper -- verify there).
    shipped_files = [Mapper().files]

    yt.run_map_reduce(
        map_stage,
        reduce_stage,
        source_table,
        result_table,
        reduce_by=['md5', 'count'],
        map_local_files=shipped_files,
    )


if __name__ == '__main__':
    main()
