#!/usr/bin/python
# -*- coding: utf-8 -*-

# Simplified banner clustering based on Levenshtein distance.

import re
import yt.wrapper as yt

# Minimum similarity score (as returned by similar()) at which a record is
# attached to an already existing cluster.
SIM_MIN = 0.70

def similar(a, b):
    """Return a Levenshtein-based similarity of two strings, as a string.

    The edit distance ``d`` between ``a`` and ``b`` is computed with a
    two-row dynamic-programming table and mapped to
    ``1 - 2 * d / (len(a) + len(b))``, formatted with three decimals
    (for example ``'0.667'``).  Equal inputs give ``'1.000'``.  The value
    may be negative: ``similar('', 'abc')`` is ``'-1.000'``.

    Two empty inputs are treated as identical and give ``'1.000'`` rather
    than raising ZeroDivisionError.

    Args:
        a: first string.
        b: second string.

    Returns:
        The similarity formatted with ``'%.3f'``; callers convert it back
        with ``float()``.
    """
    total_len = len(a) + len(b)
    if total_len == 0:
        # Both inputs are empty: nothing differs, and the formula below
        # would divide by zero.
        return '%.3f' % 1.0

    # Keep the shorter string in `a` so that each row has min(len) + 1 cells.
    if len(a) > len(b):
        a, b = b, a
    n = len(a)

    current_row = list(range(n + 1))
    for i, b_item in enumerate(b, 1):
        previous_row, current_row = current_row, [i] + [0] * n
        for j, a_item in enumerate(a, 1):
            add = previous_row[j] + 1
            delete = current_row[j - 1] + 1
            change = previous_row[j - 1]
            if a_item != b_item:
                change += 1
            current_row[j] = min(add, delete, change)

    distance = current_row[n]
    return '%.3f' % (1 - float(2 * distance) / total_len)


def prepare(rec):
    """Yield a canonical word signature for one input record.

    The ``bnorm`` field is split on whitespace, duplicate words are
    dropped, and words that consist only of digits or start with an
    underscore are filtered out.  The remaining words are sorted and
    joined with single spaces into the ``clast`` value.

    Args:
        rec: mapping with the keys ``bnorm``, ``mctgs`` and ``bid``.

    Yields:
        One dict with the keys ``mctgs``, ``clast`` and ``bid``, or
        nothing at all when no word survives the filtering.
    """
    unique_words = set(rec['bnorm'].split())
    kept = sorted(
        word for word in unique_words
        if not (re.findall('^[0-9]+$', word) or re.findall('^_', word))
    )

    if not kept:
        return

    signature = " ".join(kept)
    yield {"mctgs": rec['mctgs'], "clast": signature, "bid": rec['bid']}


def clast(key, recs):
    """Greedily attach records to the first sufficiently similar cluster.

    Records are processed in the order given.  Each one is compared, via
    similar(), with the ``clast`` value of every cluster representative
    seen so far.  On the first representative whose similarity is at
    least SIM_MIN the record gets two extra fields, ``clast_sup`` (the
    representative's ``clast``) and ``sim`` (the similarity as a float),
    and is yielded.  A record that matches no representative becomes a
    new representative itself and is NOT yielded.

    Args:
        key: group key; not used by this function.
        recs: iterable of dict-like records that have a ``clast`` field.

    Yields:
        The matched records, mutated in place with ``clast_sup`` and
        ``sim``.
    """
    representatives = []
    for rec in recs:
        for leader in representatives:
            score = float(similar(rec['clast'], leader['clast']))
            if score >= SIM_MIN:
                rec['clast_sup'] = leader['clast']
                rec['sim'] = score
                yield rec
                break
        else:
            # No existing cluster is close enough: start a new one.
            representatives.append(rec)


def main():
    """Run the preparation stage on YT and print the table row counts.

    Maps prepare() over the source table into a temporary table, sorts
    that table by ``mctgs``, ``clast`` and ``bid``, and prints the row
    count of the source table followed by that of the prepared table.
    """
    tab1 = '//home/catalogia/users/yuryz/bnrs_norm'
    tab2 = '//tmp/yuryz/clast1'

    yt.run_map(prepare, tab1, tab2)
    yt.run_sort(tab2, sort_by=['mctgs', 'clast', 'bid'])

    # The clustering reduce step is currently disabled; tab3 is the table
    # it would write to.
    tab3 = '//tmp/yuryz/clast2'

    #yt.run_reduce(clast, tab2, tab3, reduce_by = ['mctgs'])

    # Single-argument print(...) behaves the same on Python 2 and 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print(yt.row_count(tab1))
    print(yt.row_count(tab2))


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
