import codecs
import sys
import os
import json
import re
import argparse
import logging

# Configure the root logger at import time: DEBUG level, every record prefixed
# with the source file name, line number, level name and timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(filename)s[LINE:%(lineno)d]# %(levelname)-8s [%(asctime)s]  %(message)s',
)

def extract_queryfresh_from_basket_item(item):
    """Return the value of the first ``queryfresh`` param of a basket item.

    ``None`` is returned when *item* has no ``params`` key or when none of
    its params is named ``queryfresh``.
    """
    if 'params' in item:
        # Lazy generator: params are inspected in order and scanning stops at
        # the first match.
        matching_values = (
            entry['value']
            for entry in item['params']
            if entry['name'] == 'queryfresh'
        )
        return next(matching_values, None)
    return None


def extract_basket_part_from_basket_item(item):
    """Look up the ``basket_part`` param of a basket item.

    Returns the ``value`` of the first param whose ``name`` is
    ``basket_part``; returns ``None`` if *item* has no ``params`` key or no
    such param exists.
    """
    if 'params' in item:
        for entry in item['params']:
            if entry['name'] != 'basket_part':
                continue
            return entry['value']
    return None


def set_query_date_to_basket(item, query_date):
    """Overwrite the ``query_date`` param of a basket item in place.

    Only the first param named ``query_date`` is updated. If the item has no
    ``params`` key, or no param with that name, nothing is modified.

    Returns the value the param held before the update, or ``None`` when
    nothing was updated.
    """
    if 'params' in item:
        for entry in item['params']:
            if entry['name'] == 'query_date':
                previous = entry['value']
                entry['value'] = query_date
                return previous
    return None


def create_argument_parser():
    """Build the command-line parser for this script.

    All four options are mandatory: ``--basket``, ``--output-queries``,
    ``--output-serpset`` and ``--mode``.
    """
    cli = argparse.ArgumentParser()
    for option in ('--basket', '--output-queries', '--output-serpset', '--mode'):
        cli.add_argument(option, required=True)
    return cli


def _load_json(path):
    """Read and parse the UTF-8 encoded JSON file at *path*."""
    with codecs.open(path, 'r', 'utf8') as f:
        # No ``encoding`` keyword: the codecs reader already yields decoded
        # text, and ``json.load`` rejects that keyword on Python 3.9+.
        return json.load(f)


def _list_serpset_files(serpsets_dir):
    """Return ``(path, number)`` pairs for ``<digits>.json`` files, sorted by number."""
    numbered = []
    for fname in os.listdir(serpsets_dir):
        if re.match(r'^[0-9]+\.json$', fname):
            numbered.append((os.path.join(serpsets_dir, fname), int(fname.split('.')[0])))
    numbered.sort(key=lambda x: x[1])
    return numbered


def freon_v2_custom_join(args, serpsets_dir='serpsets'):
    """Join a query basket with serps found in previously saved serpsets.

    The basket (a JSON list of query dicts) is indexed by
    ``(text, regionId, device)``. Serpset files named ``<digits>.json`` in
    *serpsets_dir* are read in ascending numeric order, and for every basket
    key only the first serp that has both ``query`` and ``components`` is
    used. The numeric ``device`` of a serp query is translated to the string
    form used by the basket; codes missing from the table map to
    ``'UNKNOWN'``.

    What happens with a matched serp depends on ``args.mode``:

    * ``"take_date_from_basket"``: the serp's
      ``serp_query_param.query_date`` is written into the basket item's
      ``query_date`` param. The serp itself is not kept, so every basket
      query ends up in the output queries and the output serpset is empty.
    * any other value: the basket item's ``queryfresh`` and ``basket_part``
      params are copied into the serp, and the serp is kept for output.

    Finally, basket queries without a kept serp are written to
    ``args.output_queries`` and the kept serps, each tagged with
    ``'text.fresh_serp_source': 'reused'``, to ``args.output_serpset``.

    Args:
        args: object with ``basket``, ``output_queries``, ``output_serpset``
            and ``mode`` attributes (e.g. an ``argparse.Namespace``).
        serpsets_dir: directory scanned for serpset files.

    Raises:
        KeyError: if a basket query or a considered serp query lacks
            ``text``, ``regionId`` or ``device``, or if a matched serp lacks
            ``serp_query_param.query_date`` in ``take_date_from_basket``
            mode.
    """
    DEVICE_MAP = {
        0: "DESKTOP",
        1: "ANDROID",
        2: "IPHONE",
        3: "UNKNOWN",
        4: "WINDOWS_PHONE",
    }

    logging.info('Loading basket')
    basket = _load_json(args.basket)

    logging.info('Converting basket to dict')
    basket_dict = {(query['text'], query['regionId'], query['device']): query for query in basket}

    logging.info('Getting serpset filenames')
    serpset_filenames = _list_serpset_files(serpsets_dir)
    logging.info('Found: {}'.format(json.dumps(serpset_filenames)))

    out_serpset_dict = {}  # basket key -> serp kept for the output serpset
    basket_seen = set()    # basket keys whose query_date was already replaced

    logging.info('Reading serpsets')
    for fname, _ in serpset_filenames:
        logging.info('Loading serpset {}'.format(fname))
        cur_serpset = _load_json(fname)

        for serp in cur_serpset:
            if 'query' not in serp or 'components' not in serp:
                continue
            query = serp['query']
            query_key = (query['text'], query['regionId'], DEVICE_MAP.get(query['device'], 'UNKNOWN'))
            if query_key not in basket_dict:
                continue
            # First match wins: later serpsets never override an earlier one.
            if query_key in out_serpset_dict or query_key in basket_seen:
                continue
            # Lazy %-args on a unicode format keep non-ASCII query text
            # readable on both Python 2 and 3 (no b'...' repr in the log).
            logging.info(u'(%s, %s, %s) is taken from %s', query_key[0], query_key[1], query_key[2], fname)

            basket_item = basket_dict[query_key]
            if args.mode == "take_date_from_basket":
                query_date = serp['serp_query_param.query_date']
                old_query_date = set_query_date_to_basket(basket_item, query_date)
                logging.info('query_date substitution {} -> {}'.format(old_query_date, query_date))

                basket_seen.add(query_key)
            else:
                basket_queryfresh = extract_queryfresh_from_basket_item(basket_item)
                logging.info('queryfresh substitution {} -> {}'.format(serp.get('serp_query_param.queryfresh'), basket_queryfresh))
                serp['serp_query_param.queryfresh'] = basket_queryfresh

                basket_part = extract_basket_part_from_basket_item(basket_item)
                logging.info('basket_part substitution {} -> {}'.format(serp.get('serp_query_param.basket_part'), basket_part))
                serp['serp_query_param.basket_part'] = basket_part

                out_serpset_dict[query_key] = serp

    out_queries = []
    out_serpset = []

    logging.info('Splitting serps/basket for output')
    for query_key in basket_dict:
        if query_key in out_serpset_dict:
            out_serpset.append(out_serpset_dict[query_key])
        else:
            out_queries.append(basket_dict[query_key])

    logging.info('Total {}/{} serps reused'.format(len(out_serpset), len(basket_dict)))

    logging.info('Appending serp reused labels')
    for serp in out_serpset:
        serp['text.fresh_serp_source'] = 'reused'

    logging.info('Saving output queries')
    with codecs.open(args.output_queries, 'w', 'utf8') as f:
        json.dump(out_queries, f, indent=2, ensure_ascii=False)

    logging.info('Saving output serpset')
    with codecs.open(args.output_serpset, 'w', 'utf8') as f:
        json.dump(out_serpset, f, indent=2, ensure_ascii=False)


def main():
    """Script entry point: parse the command line and run the join."""
    parser = create_argument_parser()
    freon_v2_custom_join(parser.parse_args())


# Run the join only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
