__author__ = 'aalogachev'

import os, logging, ConfigParser
from ru.yandex import utils
from subprocess import call
import sys
# Python 2 idiom: sys.setdefaultencoding is only reachable after reload(sys).
# Making UTF-8 the default codec is what lets the byte strings read from the
# data files be substituted into the u'...' templates used further down
# without an explicit decode (presumably the queries contain non-ASCII
# text -- TODO confirm). Do not remove without revisiting those format calls.
reload(sys)
sys.setdefaultencoding("utf-8")

# Configure logging at INFO level; the debug messages below are therefore
# only emitted if the level is lowered.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the project configuration. Despite the ".json" extension the file is
# parsed by ConfigParser, so it has to be in INI format.
config = ConfigParser.SafeConfigParser()
config_path = os.path.join(utils.get_project_path(), 'config.json')
# A context manager closes the config file handle as soon as parsing is
# done instead of leaving it open for the rest of the run.
with open(config_path) as config_file:
    config.readfp(config_file)

# The defaults name the section whose settings are in effect for this run.
section = config.defaults()['active_section']
logger.debug('Using config section =' + section)
work_dir = config.get(section=section, option='work_dir')
logger.debug("Using work dir = " + work_dir)

# Input, output and intermediate file locations.
suggest_file = config.get(section, 'suggest_file')
dictionary_file = config.get(section, 'dictionary_file')
temp_file = config.get(section, 'temp_queries_file')
temp_file_2 = temp_file + '.temp'

# When true, each intermediate file is deleted once it is no longer needed.
remove_tmp_files = True

# Phase 1: reduce every row of the suggest file to "<query>\t<count>" and
# write it to the first temporary file.

# Module-level state that the aggregation phase further down starts from.
prev_query = None
count = 0
line_counter = 0

# Context managers guarantee that both files are closed (and the output is
# flushed) even if a row fails to parse.
with open(suggest_file, mode='r') as in_queries_file, \
        open(temp_file, mode='w') as temp_queries_file:
    for line_counter, line in enumerate(in_queries_file, 1):
        # Drop the line terminator before splitting: when the count is the
        # last column it would otherwise keep its '\n', and every record
        # would be followed by a blank line in the temporary file.
        tokens = line.rstrip('\r\n').split('\t')

        # Column 1 is the query to record; an empty value means there is no
        # suggest for this phrase, so fall back to column 0.
        query = tokens[1] or tokens[0]
        count = tokens[2]

        temp_queries_file.write(u'{0}\t{1}\n'.format(query, count).encode('UTF-8'))

        if line_counter % 1000 == 0:
            logger.info(u'Phase 1: Processed {0} lines'.format(line_counter))

# Sort the intermediate file so that all rows of the same query become
# adjacent, which is what the aggregation pass below relies on.
logger.info('Sorting queries from {0} to {1}'.format(temp_file, temp_file_2))
# Force the C locale: with locale-aware collation `sort` may ignore or
# reweigh characters such as the tab separator, which can interleave rows of
# different queries and split one query into several output records.
# Byte-order comparison keeps identical queries contiguous.
sort_status = call(['sort', temp_file, "-o", temp_file_2],
                   env=dict(os.environ, LC_ALL='C'))
if sort_status != 0:
    # Stop before the unsorted data is deleted; it is the only copy of the
    # phase 1 result and is needed to retry.
    raise RuntimeError(
        'sort exited with status {0} while sorting {1}'.format(sort_status, temp_file))
if remove_tmp_files:
    call(['rm', temp_file])

# Phase 2: collapse the sorted "<query>\t<count>" rows into one row per
# query carrying the sum of its counts, and write the dictionary file.
line_counter = 0
# Start from a clean state rather than whatever earlier code left behind.
prev_query = None  # query whose rows are currently being summed
count = 0          # running total for prev_query

# Context managers guarantee that both files are closed (and the output is
# flushed) even if a row fails to parse.
with open(temp_file_2, mode='r') as temp_queries_file_2, \
        open(dictionary_file, mode='w') as out_queries_file:
    for line_counter, line in enumerate(temp_queries_file_2, 1):
        if not line.strip():
            # Blank rows carry no record; skipping them avoids an IndexError
            # on the missing count column.
            continue

        tokens = line.split('\t')

        if tokens[0] == prev_query:
            # Same query as the previous row: keep accumulating.
            # int() tolerates the trailing newline left on the count.
            count += int(tokens[1])
        else:
            # A new query starts; flush the finished one first.
            if prev_query is not None:
                out_queries_file.write(u'{0}\t{1}\n'.format(prev_query, count).encode('UTF-8'))
            prev_query = tokens[0]
            count = int(tokens[1])

        if line_counter % 1000 == 0:
            logger.info(u'Phase 2. Processed {0} lines'.format(line_counter))

    # Flush the last group. Guarded so that an empty input does not produce
    # a bogus "None" record. The final record is intentionally written
    # without a trailing newline to keep the existing output format.
    if prev_query is not None:
        out_queries_file.write(u'{0}\t{1}'.format(prev_query, count).encode('UTF-8'))

if remove_tmp_files:
    call(['rm', temp_file_2])

