#!/usr/bin/python
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
from collections import defaultdict

class Clusterizer:
    """Incrementally groups documents into clusters of near-identical ones.

    A document is a mapping ``feature -> value``. Only entries whose value
    equals 1 are indexed and counted as shared features; the "size" of a
    document is the sum of all its values.

    Attributes:
        basedocs:  list of the first document of every cluster; the list
                   position is the cluster index.
        docs:      cluster index -> list of ``[ doc, extra ]`` members.
        scoring:   feature -> set of cluster indexes whose base document
                   has that feature set to 1 (an inverted index).
        threshold: a document joins a cluster only when both it and the
                   cluster's base document have fewer than this many
                   units of size not covered by shared features.
    """

    def __init__( self, threshold = 3 ):
        """Create an empty clusterizer.

        threshold -- exclusive upper bound on the allowed difference
                     between a document and a cluster's base document
                     (defaults to 3).
        """
        self.basedocs = []
        self.docs = defaultdict( list )
        self.scoring = defaultdict( set )
        self.threshold = threshold
        self._count = 0

    def _addToCluster( self, index, doc, extra ):
        """Append ``[ doc, extra ]`` to the members of cluster `index`."""
        self.docs[ index ].append( [ doc, extra ] )

    def _addCluster( self, doc, extra ):
        """Open a new cluster with `doc` as its base document and index it."""
        self.basedocs.append( doc )
        index = len( self.basedocs ) - 1
        self._addToCluster( index, doc, extra )
        # items() instead of iteritems(): works on both Python 2 and 3.
        for feature, value in doc.items():
            if value == 1:
                self.scoring[ feature ].add( index )

    def _findCluster( self, doc ):
        """Return the index of the first cluster close enough to `doc`.

        Returns None when no existing cluster qualifies.
        """
        size = sum( doc.values() )
        shared = defaultdict( int )
        for feature, value in doc.items():
            if value == 1:
                # .get() does not trigger the defaultdict factory, so
                # looking up an unknown feature leaves the index untouched.
                for index in self.scoring.get( feature, () ):
                    shared[ index ] += 1
        # Candidates are examined in the iteration order of `shared`;
        # the first one that passes the test wins.
        for index, count in shared.items():
            base_size = sum( self.basedocs[ index ].values() )
            if max( size - count, base_size - count ) < self.threshold:
                return index
        return None

    def checkDoc( self, doc, extra ):
        """Assign `doc` to a matching cluster, or start a new one.

        doc   -- mapping ``feature -> value``.
        extra -- opaque payload stored next to the document.
        """
        self._count += 1
        cluster_index = self._findCluster( doc )
        if cluster_index is None:
            self._addCluster( doc, extra )
        else:
            self._addToCluster( cluster_index, doc, extra )

    def getDocsCount( self ):
        """Return how many documents have been passed to checkDoc()."""
        return self._count
