#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import re
import os
import sys
from collections import deque
import subprocess
from datetime import datetime, timedelta


# Directory that load_dicts() scans for dictionary files (names starting
# with 'dict_').
dicts_path = 'WORKING_DIR/rules'
# Encoding used to decode every line read from a dictionary file.
dict_file_codepage = 'koi8-r'


def get_available_dicts(path):
    """Return the names of the entries in *path* that start with ``dict_``.

    Only bare names are returned (as produced by ``os.listdir``); they are
    not joined with *path*.
    """
    return [entry for entry in os.listdir(path) if entry.startswith('dict_')]

def load_dicts():
    """Load every dictionary file found in ``dicts_path``.

    Each file is read line by line. Lines whose first character is ``#``
    are skipped; every other line is decoded with ``dict_file_codepage``,
    stripped of surrounding whitespace and lower-cased. Lines that are
    empty after stripping are ignored.

    Returns:
        dict mapping the dictionary file name to the set of its entries.
    """
    dictionaries = {}  # file name -> set of entries
    for dname in get_available_dicts(dicts_path):
        entries = set()
        # ``with`` closes the handle even if decoding a line raises.
        with open(os.path.join(dicts_path, dname)) as dfile:
            for line in dfile:
                # The comment marker is only recognised in the first column,
                # before any stripping takes place.
                if line.startswith('#'):
                    continue
                line = line.decode(dict_file_codepage).strip().lower()
                if line:
                    entries.add(line)
        dictionaries[dname] = entries
    return dictionaries


def get_bigrams(letter):
    """Return the set of ``first_second`` word pairs found in *letter*.

    The text is split on runs of characters that are not ASCII
    letters/digits or Cyrillic letters. A token counts as a word when it
    has at least 3 characters and is not purely numeric; words are
    lower-cased. Two words form a bigram only when they are adjacent
    tokens: any rejected token in between breaks the chain.
    """
    tokenizer = re.compile(u'[^0-9a-zA-Zа-яА-ЯЁё]+', re.U)
    pairs = set()
    previous = ''
    for token in tokenizer.split(letter):
        if len(token) < 3 or token.isdigit():
            # Rejected token: the next word must not pair with the last one.
            previous = ''
            continue
        current = token.lower().strip()
        if previous:
            pairs.add(previous + '_' + current)
        previous = current
    return pairs


def append_ngrams(words, res):
    """Add every prefix n-gram of *words* to *res*, in place.

    Walks *words* from the start; the first ``k`` words joined with ``_``
    are added to ``res[k]``. The walk stops at the first empty (falsy)
    word, so nothing is added if the first word is empty.

    *res* must already contain a set for every key ``1..len(words)`` that
    can be reached.
    """
    prefix = []
    for size, token in enumerate(words, 1):
        if not token:
            return
        prefix.append(token)
        res[size].add('_'.join(prefix))


def get_ngrams(letter, max_phrase_len=4, min_word_len=3):
    """Collect all word n-grams of *letter* for sizes 1..max_phrase_len.

    The text is split on runs of characters that are not ASCII
    letters/digits or Cyrillic letters. Tokens shorter than
    *min_word_len* and purely numeric tokens are dropped; unlike in
    ``get_bigrams`` a dropped token does NOT break the phrase, and the
    remaining words keep their original case.

    NOTE(review): words are not lower-cased here, while ``load_dicts``
    lower-cases dictionary entries -- presumably callers lower-case the
    text first; confirm against the call sites.

    Args:
        letter: text to analyse.
        max_phrase_len: largest n-gram size to produce (default 4).
        min_word_len: minimum token length to be kept as a word (default 3).

    Returns:
        dict mapping each size ``n`` in ``1..max_phrase_len`` to the set of
        ``n`` consecutive kept words joined with ``_``. Sizes with no
        phrases map to an empty set.
    """
    word_splitter = re.compile(u'[^0-9a-zA-Zа-яА-ЯЁё]+', re.U)
    # ``word`` truthiness guards against empty split fragments should a
    # caller pass min_word_len=0.
    words = [word for word in word_splitter.split(letter)
             if word and len(word) >= min_word_len and not word.isdigit()]

    res = {}
    for size in range(1, max_phrase_len + 1):
        phrases = set()
        # Slide a window of ``size`` words over the kept words.
        for start in range(len(words) - size + 1):
            phrases.add('_'.join(words[start:start + size]))
        res[size] = phrases
    return res


def get_ngram_intersecton(ngrams):
    """Match n-grams against every loaded dictionary and summarise the hits.

    Args:
        ngrams: mapping of n-gram size to an iterable collection of phrases
            (the shape returned by ``get_ngrams``).

    Returns:
        dict keyed by n-gram size. Each value is a dict with:
          * ``'dicts'``: dictionary name -> list of phrases found in it
            (an entry is created for every dictionary as soon as at least
            one phrase of that size is examined);
          * ``'not_found'``: phrases present in no dictionary;
          * ``'perc'``: a display string with the share of phrases found in
            the dictionary whose name contains ``sign``, relative to the
            phrases not found in the dictionary whose name contains
            ``skip``.

    NOTE(review): if several dictionary names contain ``sign`` (or
    ``skip``), only the count of the last one iterated is used -- this
    looks like it assumes one dictionary of each kind; confirm.
    """
    dictionaries = load_dicts()
    res = {}
    for (size, phrases) in ngrams.items():
        found = {}
        not_found = []
        res[size] = {'dicts': found, 'not_found': not_found}
        for phrase in phrases:
            match_found = False
            for (dname, dset) in dictionaries.items():
                # setdefault replaces the deprecated dict.has_key() check.
                hits = found.setdefault(dname, [])
                if phrase in dset:
                    match_found = True
                    hits.append(phrase)
            if not match_found:
                not_found.append(phrase)

        n_sign = n_skip = 0
        for (dname, hits) in found.items():
            if 'sign' in dname:
                n_sign = len(hits)
            elif 'skip' in dname:
                n_skip = len(hits)

        # Phrases that were not skipped form the denominator of the ratio.
        n_counted = len(phrases) - n_skip
        if n_counted == 0 and n_sign == 0:
            perc = 'lim<sub>x-->0</sub>(x/x)'
        elif n_counted == 0:
            # Avoid a ZeroDivisionError; show the undefined ratio literally.
            perc = str(n_sign) + ' / 0'
        else:
            perc = "%.2f%%" % (100. * n_sign / n_counted)
        res[size]['perc'] = perc
    return res
