import yt.wrapper
import sys
import urlparse
import re
import urllib
import urllib2
import math
import json
import time
import datetime
from datetime import date, datetime, timedelta
import argparse
import ast

def get_ui(key, records):
    """Reducer: count how many records share one ``ui`` value.

    Args:
        key: reduce key passed by the framework (not used here).
        records: iterable of dict-like rows, each with a ``'ui'`` field.

    Yields:
        A single dict ``{'ui': <ui of the last record>, 'total': <count>}``
        where the count is a float. Nothing is yielded for an empty group
        (previously this raised ``UnboundLocalError`` because ``ui`` was
        never assigned).
    """
    ui = None
    total = 0.
    for record in records:
        ui = record['ui']
        total += 1
    if not total:
        # No records were seen, so there is no ``ui`` to report.
        return
    yield {
        'ui': ui,
        'total': total,
    }

def group_by_ui(key, recs):
    """Reducer: emit the ``dayuse == '0'`` user inputs of a marked ``ui`` group.

    The group is considered marked when at least one of its records contains
    a ``'total'`` key (the shape produced by ``get_ui``). Every other record
    must provide ``'dayuse'`` and ``'user_input'``.

    Args:
        key: reduce key passed by the framework (not used here).
        recs: iterable of dict-like rows belonging to one group.

    Yields:
        ``{'query': <user_input>}`` for each buffered input, in arrival
        order, but only if the group was marked.
    """
    has_total = False
    queries = []
    for rec in recs:
        if "total" in rec:
            has_total = True
        elif rec["dayuse"] == '0':
            # Buffer the input: the marker record may arrive later in the
            # stream, so nothing can be emitted until the group is exhausted.
            queries.append(rec["user_input"])
    if has_total:
        for query in queries:
            yield {
                'query': query,
            }

def group_by_query(key, recs):
    """Reducer: count how many records share one ``query`` value.

    Args:
        key: reduce key passed by the framework (not used here).
        recs: iterable of dict-like rows, each with a ``'query'`` field.

    Yields:
        A single dict ``{'query': <query of the last record>, 'total': <count>}``
        where the count is a float. Nothing is yielded for an empty group
        (previously this raised ``UnboundLocalError`` because ``action`` was
        never assigned).
    """
    action = None
    total = 0.
    for rec in recs:
        action = rec["query"]
        total += 1
    if not total:
        # No records were seen, so there is no query to report.
        return
    yield {
        'query': action,
        'total': total,
    }

def classify_queries(key, records):
    """Reducer: attach the best-scoring theme to a counted query.

    Records containing a ``'suggestions'`` key supply a string holding a
    Python dict literal that maps theme -> score; all other records supply
    ``'query'`` and ``'total'``. When several records of one kind are
    present, the last one wins.

    Args:
        key: reduce key passed by the framework (not used here).
        records: iterable of dict-like rows belonging to one group.

    Yields:
        ``{'theme': ..., 'query': ..., 'total': ...}`` once, and only when
        the group contained both kinds of record. The theme is the key with
        the highest score after discarding ``'unknown'``; if nothing remains
        the theme is ``'unknown'``.
    """
    markup = None
    text = None
    count = 0
    for row in records:
        if 'suggestions' in row:
            markup = row['suggestions']
            continue
        text = row['query']
        count = row['total']

    if markup is None or text is None:
        return

    scores = ast.literal_eval(markup)
    scores.pop('unknown', None)
    if scores:
        # Ascending stable sort, then take the tail: on equal scores the key
        # that comes later in the dict's iteration order is chosen.
        ranked = sorted(scores, key=scores.get)
        theme = ranked[-1]
    else:
        theme = 'unknown'

    yield {
        'theme': theme,
        'query': text,
        'total': count,
    }

def parse_args():
    """Parse the script's command-line options from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with ``timestamp``, ``from_date``,
        ``to_date`` and ``blockstat`` attributes.
    """
    cli = argparse.ArgumentParser(description='analyse yastroka queries', add_help=True)
    cli.add_argument('--timestamp', help='date timestamp for calculation')
    # Both ends of the date range share the same default day.
    range_options = (
        ('--from_date', 'from date for calculation (format: YYYY-MM-DD)'),
        ('--to_date', 'to date for calculation (format: YYYY-MM-DD)'),
    )
    for flag, description in range_options:
        cli.add_argument(flag, default='2017-02-15', help=description)
    cli.add_argument(
        "--bs",
        dest="blockstat",
        default='/home/galamaj/blockstat.dict',
        required=False,
        help="path to blockstat.dict",
    )
    # Options currently disabled:
    # cli.add_argument('--mr_server', default='hahn', help='MR server (hahn, banach, ...)')
    # cli.add_argument('--yt_pool', default='robot-suggestor-dev', help='YT pool')
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` day strings to process.

    Args:
        timestamp: optional string of digits; its first 10 characters are
            read as epoch seconds and converted to a local-time date.
        from_date: optional first day of an inclusive range (``YYYY-MM-DD``).
        to_date: optional last day of the range; when ``from_date`` is given
            without it, today's date is used.

    Returns:
        The timestamp's day (if any) followed by every day of the range
        (if ``from_date`` is given). A range whose end precedes its start
        contributes nothing.
    """
    day_format = '%Y-%m-%d'
    days = []

    if timestamp:
        moment = datetime.fromtimestamp(int(timestamp[:10]))
        days.append(moment.strftime(day_format))

    # Without a start day there is no range to expand.
    if not from_date:
        return days

    if not to_date:
        to_date = datetime.now().strftime(day_format)

    first_day = datetime.strptime(from_date, day_format)
    last_day = datetime.strptime(to_date, day_format)
    # Both ends are parsed as midnight, so the difference is a whole number
    # of days; a negative span yields an empty range.
    span = (last_day - first_day).days
    for offset in range(span + 1):
        days.append((first_day + timedelta(offset)).strftime(day_format))
    return days

def calc(date):
    """Run the query-classification pipeline for one day's table.

    Steps, each a sort followed by a reduce:
      1. count records per ``ui`` in the day's table (``get_ui``);
      2. join those counts with the preparates table by ``ui`` and keep the
         matching queries (``group_by_ui``);
      3. count occurrences of each query (``group_by_query``);
      4. join with the queries-markup table by ``query`` and write the
         classified result (``classify_queries``).

    Args:
        date: day string appended to the yastroka base path to form the
            input table name.
    """
    source_table = '//home/suggest-dev/galamaj/yastroka/' + date
    result_table = '//home/suggest-dev/galamaj/yastroka/actions_unique_ui/classified_queries_dayuse_1'
    preparates_table = '//home/suggest-dev/galamaj/yastroka/preparates/2017-02-15'
    markup_table = '//home/search-research/queries-markup'
    scratch_dir = '//home/suggest-dev/galamaj/tmp'

    yt.wrapper.config['memory_limit'] = 100 * 1024 * 1024 * 1024
    yt.wrapper.create(
        "table",
        path=result_table,
        recursive=True,
        ignore_existing=True,
        attributes=None,
    )
    # NOTE(review): the result path is the same for every date and append
    # mode is not requested - presumably each run replaces the previous
    # output; confirm this is intended.
    result_path = yt.wrapper.TablePath(result_table)
    print("%s   %s -> %s" % (str(datetime.now()), source_table, result_table))

    with yt.wrapper.TempTable(scratch_dir, prefix='ui') as ui_counts:
        with yt.wrapper.TempTable(scratch_dir, prefix='actions') as ui_queries:
            with yt.wrapper.TempTable(scratch_dir, prefix='queries') as query_counts:
                # 1. Per-ui record counts. No destination is passed to the
                #    sorts, so the tables are sorted under their own paths.
                yt.wrapper.run_sort(source_table, sort_by=['ui'])
                yt.wrapper.run_reduce(
                    get_ui, source_table, ui_counts,
                    reduce_by='ui', format=yt.wrapper.JsonFormat())

                # 2. Queries of the uis present in both tables.
                yt.wrapper.run_sort(ui_counts, sort_by=['ui'])
                yt.wrapper.run_sort(preparates_table, sort_by=['ui'])
                yt.wrapper.run_reduce(
                    group_by_ui, [ui_counts, preparates_table], ui_queries,
                    reduce_by="ui", format=yt.wrapper.JsonFormat())

                # 3. Occurrence count per query.
                yt.wrapper.run_sort(ui_queries, sort_by=['query'])
                yt.wrapper.run_reduce(
                    group_by_query, ui_queries, query_counts,
                    reduce_by="query", format=yt.wrapper.JsonFormat())

                # 4. Theme classification against the markup table.
                yt.wrapper.run_sort(query_counts, sort_by=['query'])
                yt.wrapper.run_reduce(
                    classify_queries, [query_counts, markup_table], result_path,
                    reduce_by="query", format=yt.wrapper.JsonFormat())


if __name__ == '__main__':
    # Resolve the requested days from the command line, show them, then run
    # the pipeline once per day in order.
    args = parse_args()
    dates = get_dates(
        args.timestamp,
        args.from_date,
        args.to_date,
    )
    print(dates)
    for date in dates:
        calc(date)





