import yt.wrapper
import sys
import urlparse
import re
import urllib
import urllib2
import math
import json
import time
import datetime
from datetime import date, datetime, timedelta
import argparse
import ast

def group_by_query(key, records):
    """Reducer that counts the records of one ``user_input`` group.

    Args:
        key: reduce key passed in by the caller; it is not used here.
        records: iterable of dict-like rows, each carrying a ``user_input``
            field.

    Yields:
        One dict ``{'query': ..., 'total': ...}`` where ``query`` is the
        ``user_input`` of the last record seen and ``total`` is the number of
        records as a float. Nothing is yielded when ``records`` is empty.
    """
    query = None
    total = 0.  # float on purpose: keeps the emitted value type unchanged
    for record in records:
        query = record['user_input']
        total += 1
    if not total:
        # Empty group: without this guard `query` would be unbound at the
        # yield below and the reducer would die with UnboundLocalError.
        return
    yield {
        'query': query,
        'total': total,
    }

def classify_queries(key, records):
    """Reducer that attaches a theme to a counted query.

    The group is expected to mix two row shapes: rows with a ``suggestions``
    field (a string holding a Python dict literal) and rows with ``query`` and
    ``total`` fields. When several rows of one shape are present, the last one
    wins.

    Args:
        key: reduce key passed in by the caller; it is not used here.
        records: iterable of dict-like rows of either shape.

    Yields:
        One dict ``{'theme', 'query', 'total'}`` when both row shapes were
        seen; nothing otherwise. ``theme`` is the key with the highest value
        in the parsed suggestions once ``'unknown'`` has been dropped, or
        ``'unknown'`` if no other key is left.
    """
    markup = None
    query_text = None
    count = 0
    for row in records:
        if 'suggestions' not in row:
            query_text = row['query']
            count = row['total']
        else:
            markup = row['suggestions']

    # Both sides of the join are required to emit anything.
    if markup is None or query_text is None:
        return

    scores = ast.literal_eval(markup)
    scores.pop('unknown', None)
    if scores:
        # Ascending stable sort, then take the last element: the highest
        # score wins, and on ties the key that comes last in dict order.
        theme = sorted(scores, key=scores.get)[-1]
    else:
        theme = 'unknown'

    yield {
        'theme': theme,
        'query': query_text,
        'total': count,
    }

def parse_args():
    """Parse the command-line options of this script.

    Returns:
        The ``argparse`` namespace with attributes ``timestamp``,
        ``from_date``, ``to_date`` and ``blockstat`` (set by ``--bs``).
    """
    option_table = (
        (('--timestamp',), {
            'help': 'date timestamp for calculation',
        }),
        (('--from_date',), {
            'default': '2017-03-01',
            'help': 'from date for calculation (format: YYYY-MM-DD)',
        }),
        (('--to_date',), {
            'default': '2017-03-07',
            'help': 'to date for calculation (format: YYYY-MM-DD)',
        }),
        (("--bs",), {
            'dest': "blockstat",
            'help': "path to blockstat.dict",
            'default': '/home/galamaj/blockstat.dict',
            'required': False,
        }),
        # Disabled options, kept for reference:
        # (('--mr_server',), {'default': 'hahn', 'help': 'MR server (hahn, banach, ...)'}),
        # (('--yt_pool',), {'default': 'robot-suggestor-dev', 'help': 'YT pool'}),
    )
    parser = argparse.ArgumentParser(description='analyse yastroka queries', add_help=True)
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` day strings to process.

    Args:
        timestamp: optional string; its first 10 characters are read as
            integer seconds and converted with ``datetime.fromtimestamp``.
        from_date: optional first day of a range, ``YYYY-MM-DD``.
        to_date: optional last day of the range (inclusive), ``YYYY-MM-DD``.
            When ``from_date`` is given without it, today's date is used.

    Returns:
        A list of day strings: the timestamp's day first (if any), followed
        by every day of the range (if ``from_date`` is set). The range is
        empty when ``from_date`` is later than ``to_date``.
    """
    day_format = '%Y-%m-%d'
    result = []

    if timestamp:
        moment = datetime.fromtimestamp(int(timestamp[:10]))
        result.append(moment.strftime(day_format))

    # An open-ended range runs up to the current day.
    if from_date and not to_date:
        to_date = datetime.now().strftime(day_format)

    if from_date and to_date:
        cursor = datetime.strptime(from_date, day_format)
        last_day = datetime.strptime(to_date, day_format)
        one_day = timedelta(days=1)
        while cursor <= last_day:
            result.append(cursor.strftime(day_format))
            cursor = cursor + one_day

    return result

def calc(date):
    """Run the query-classification pipeline for one day.

    Steps, in order: sort the day's input table by ``user_input``, reduce it
    with ``group_by_query`` into a temporary table, sort that by ``query``,
    then reduce it together with the queries-markup table through
    ``classify_queries`` into the day's output table.

    Args:
        date: day string appended to the input and output table paths.
    """
    source_table = '//home/suggest-dev/galamaj/yastroka/' + date
    result_table = '//home/suggest-dev/galamaj/yastroka/classified_queries/' + date
    markup_table = '//home/search-research/queries-markup'

    yt.wrapper.config['memory_limit'] = 100 * 1024 * 1024 * 1024
    yt.wrapper.create("table", path=result_table, recursive=True, ignore_existing=True, attributes=None)
    result_path = yt.wrapper.TablePath(result_table)  # , append = True

    # Single-argument parenthesised print: same output under Python 2.
    print("%s   %s -> %s" % (str(datetime.now()), source_table, result_table))

    # Earlier variants, kept for reference:
    #yt.wrapper.run_map(mapper, inputTable, outputPath, format = yt.wrapper.JsonFormat())
    #yt.wrapper.run_map_reduce(mapper, group_by_action, inputTable, outputPath, reduce_by='dayuse', format = yt.wrapper.JsonFormat())
    with yt.wrapper.TempTable('//home/suggest-dev/galamaj/tmp', prefix='unique_queries') as unique_queries:
        # NOTE(review): run_sort gets no destination here, so it presumably
        # sorts the table in place - confirm against the yt.wrapper docs.
        yt.wrapper.run_sort(source_table, sort_by=['user_input'])
        yt.wrapper.run_reduce(
            group_by_query,
            source_table,
            unique_queries,
            reduce_by='user_input',
            format=yt.wrapper.JsonFormat(),
        )
        yt.wrapper.run_sort(unique_queries, sort_by=['query'])
        yt.wrapper.run_reduce(
            classify_queries,
            [unique_queries, markup_table],
            result_path,
            reduce_by="query",
            format=yt.wrapper.JsonFormat(),
        )


if __name__ == '__main__':
    # Script entry point: resolve the requested days from the command line
    # and run the pipeline once per day, in list order.
    args = parse_args()
    dates = get_dates(args.timestamp, args.from_date, args.to_date)
    # Parenthesised single-argument print gives the same output on Python 2
    # and is also valid Python 3.
    print(dates)
    # The loop variable is deliberately not called `date`: at module level
    # that name would rebind the `date` class imported from datetime.
    for day in dates:
        calc(day)




