__author__ = 'aalogachev'

import logging
import collections
import ConfigParser
from ru.yandex import utils

# Configure root logging once for the whole script.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

logger = logging.getLogger('statistics.queries_stat')

# The project configuration is parsed with ConfigParser, i.e. it is expected
# to be INI-style text even though the file is named config.json.
config = ConfigParser.SafeConfigParser()
# Close the config file as soon as it has been parsed instead of leaking
# the handle for the lifetime of the process.
with open(utils.get_project_path() + '/config.json') as config_file:
    config.readfp(config_file)
# The defaults section names the section that holds the active settings.
section = config.defaults()['active_section']
logger.debug('Using config section =' + section)
work_dir = config.get(section, 'work_dir')
logger.debug("Using work dir = " + work_dir)
queries_file = config.get(section, 'queries_file')
stat_file = config.get(section, 'stat_file')
subqueries_stat_file = config.get(section, 'subqueries_stat_file')

# Minimum total count a word or subquery needs in order to be kept.
MIN_SUBQUERY_COUNT = 50
# When True the script also builds multi-word subqueries level by level.
find_freq_patterns = False

def load_queries_and_counts(in_file, query_pos, count_pos = None):
    """Read a tab-separated query log and aggregate query and word counts.

    Args:
        in_file: path of the tab-separated input file, one record per line.
        query_pos: zero-based index of the column holding the query text.
        count_pos: zero-based index of the column holding the integer count
            of the query; when None every line counts as one occurrence.

    Returns:
        A tuple ``(queries_count, words_count)``.  ``queries_count`` is a
        Counter mapping query -> summed count.  ``words_count`` maps every
        whitespace-separated word to a two-item list ``[total, per_query]``:
        ``total`` is the line count added once per occurrence of the word,
        ``per_query`` is a Counter mapping query -> count accumulated the
        same way.

    Raises:
        IndexError: a line has fewer columns than the positions require.
        ValueError: the count column does not hold an integer.
    """
    queries_count = collections.Counter()
    words_count = dict()
    lines_count = 0
    # The context manager closes the input even if a malformed line raises.
    with open(in_file) as source:
        for line in source:
            fields = line.rstrip().split('\t')
            query = fields[query_pos]
            # The column is read as text and must be converted before it is
            # added to the integer counters; without a count column each
            # line stands for a single occurrence.
            count = int(fields[count_pos]) if count_pos is not None else 1
            queries_count[query] += count
            for word in query.split():
                data = words_count.get(word)
                if data is None:
                    data = [0, collections.Counter()]
                    words_count[word] = data
                data[0] += count
                data[1][query] += count
            lines_count += 1
            # Progress report every 100 lines.
            if lines_count % 100 == 0:
                logger.info("Lines processed = " + str(lines_count))
    return (queries_count, words_count)

def remove_inplace_by_min_count(some_count, min_count=None):
    """Delete, in place, every entry whose total is below the threshold.

    Args:
        some_count: dict whose values are sequences with the total count as
            their first item; it is modified in place.
        min_count: smallest total that is kept; defaults to the module-level
            MIN_SUBQUERY_COUNT when None.

    Returns:
        The same ``some_count`` object, after the rare entries were removed.
    """
    if min_count is None:
        min_count = MIN_SUBQUERY_COUNT
    # Collect the keys first: deleting from a dict while iterating over its
    # live items view raises RuntimeError on Python 3.
    rare_keys = [key for key, data in some_count.items() if data[0] < min_count]
    for key in rare_keys:
        del some_count[key]
    return some_count

def next_level(words_count, current_level_count, min_count=None):
    """Extend every subquery of the current level by one more word.

    A subquery is a space-separated list of words.  It is only extended with
    words that compare strictly greater than its last word, so each word
    combination is generated in a single (ascending) order.

    Args:
        words_count: mapping whose keys are the candidate words; only the
            keys are used here.
        current_level_count: dict mapping subquery -> ``[total, per_query]``
            where ``per_query`` maps a full query to its count.
        min_count: smallest total an extended subquery needs in order to be
            kept; defaults to the module-level MIN_SUBQUERY_COUNT when None.

    Returns:
        A new dict mapping each kept extended subquery to
        ``[total, per_query]``, where ``per_query`` is restricted to the
        queries that also contain the added word.
    """
    if min_count is None:
        min_count = MIN_SUBQUERY_COUNT
    new_level_count = dict()
    for subquery, sub_data in current_level_count.items():
        # Both values below do not depend on the candidate word, so they are
        # computed once per subquery instead of once per (subquery, word).
        last_word = subquery.split()[-1]
        supporting = [(query, counts, set(query.split()))
                      for query, counts in sub_data[1].items()]
        for cur_word in words_count:
            if cur_word <= last_word:
                continue
            new_data = [0, dict()]
            for query, counts, query_words in supporting:
                if cur_word in query_words:
                    new_data[0] += counts
                    new_data[1][query] = counts
            if new_data[0] >= min_count:
                new_level_count[subquery + ' ' + cur_word] = new_data
    return new_level_count

def print_counts(some_counts):
    """Log one INFO line per entry: the key, a space, and the entry's total.

    ``some_counts`` maps a string key to a sequence whose first item is the
    total that gets printed.
    """
    for label, stats in some_counts.items():
        logger.info(" ".join((label, str(stats[0]))))

def store_to_file(some_counts, writable):
    """Write every entry as a ``<total>\\t<key>`` line to ``writable``.

    ``some_counts`` maps a string key to a sequence whose first item is the
    total; ``writable`` is any object with a ``write`` method.  One ``write``
    call is issued per entry.
    """
    for label, stats in some_counts.items():
        record = '\t'.join((str(stats[0]), label))
        writable.write(record + '\n')


# All statistics are written to one output file.  The context manager makes
# sure it is flushed and closed even when one of the steps below raises.
with open(stat_file, mode='w') as outstat:
    queries_count, words_count = load_queries_and_counts(queries_file, 0, 1)

    # Write the queries ordered by ascending count.
    ordered_stat = collections.OrderedDict(sorted(queries_count.items(), key=lambda t: t[1]))
    for query, query_count in ordered_stat.items():
        outstat.write(str(query_count) + '\t' + query + '\n')

    # Remove words whose total is below MIN_SUBQUERY_COUNT.
    words_count = remove_inplace_by_min_count(words_count)

    # Initial seeding: the first level consists of the remaining single words.
    current_level = words_count
    new_subqueries_count = len(current_level)
    print_counts(current_level)
    logger.info("Total words = " + str(new_subqueries_count))
    # NOTE(review): word/subquery statistics are appended to stat_file, while
    # subqueries_stat_file is read from the config but never used - confirm
    # which file these lines are meant to go to.
    store_to_file(current_level, outstat)

    if find_freq_patterns:
        # Grow the subqueries by one word per iteration until a level
        # produces no subquery that reaches the minimum count.
        while new_subqueries_count != 0:
            current_level = next_level(words_count, current_level)
            new_subqueries_count = len(current_level)
            print_counts(current_level)
            store_to_file(current_level, outstat)
            logger.info("New level count = " + str(new_subqueries_count))

