import argparse
import os
from functools import reduce
from yt.wrapper import YtClient


def merge_urls(a, b):
    """Merge two ``url -> entry`` mappings into a new mapping.

    Each entry is expected to be a mapping with an ``'attributes'`` list.
    For a url present in both inputs, the result keeps the fields of the
    entry from ``a`` and appends every attribute of ``b``'s entry that is
    not already among ``a``'s attributes. Urls present only in ``b`` are
    taken over as-is.

    Neither ``a`` nor ``b`` (nor any of their entries) is modified: a merged
    entry is always a fresh dict with a fresh attribute list. Entries that
    need no merging are shared by reference with the inputs.

    Note: membership is checked against ``a``'s attributes only, so
    duplicates inside ``b``'s own attribute list are not collapsed.

    Args:
        a: Base mapping of url to entry.
        b: Mapping whose entries are merged on top of ``a``.

    Returns:
        A new dict containing the merged entries.

    Raises:
        KeyError: If an entry that has to be merged lacks ``'attributes'``.
        TypeError: If an attribute of ``a``'s entry is unhashable.
    """
    merged = a.copy()
    for url, value in b.items():
        if url not in merged:
            merged[url] = value
            continue

        current = merged[url]
        seen = set(current['attributes'])
        # Copy before extending: the outer copy above is shallow, so extending
        # the existing list in place would leak the change into the caller's
        # input.
        attributes = list(current['attributes'])
        attributes.extend(x for x in value['attributes'] if x not in seen)
        entry = dict(current)
        entry['attributes'] = attributes
        merged[url] = entry
    return merged


def merge_data(a, b):
    """Combine two data records into a new record.

    The list fields ``pron``, ``relev``, ``rearr`` and ``glob`` are
    concatenated (``a`` first, then ``b``; a missing field counts as an
    empty list), and the ``urls`` mappings are combined with ``merge_urls``.
    ``conditionFlag`` is carried over from ``a`` only, and only when ``a``
    has it.

    Args:
        a: First record; must contain ``'urls'``.
        b: Second record; must contain ``'urls'``.

    Returns:
        A new dict with the combined fields.
    """
    list_fields = ('pron', 'relev', 'rearr', 'glob')
    merged = {field: a.get(field, []) + b.get(field, []) for field in list_fields}
    merged['urls'] = merge_urls(a['urls'], b['urls'])
    if 'conditionFlag' in a:
        merged['conditionFlag'] = a['conditionFlag']
    return merged


def _condition_of(data):
    """Return the condition name of a single data record ('default' if unset)."""
    return data.get('conditionFlag', 'default')


def merge_by_condition(a, b):
    """Merge two values into a ``condition -> data record`` mapping.

    Each argument is either a single data record (recognised by having a
    ``'urls'`` key) or a mapping from condition name to data record. A single
    record belongs to the condition named by its ``'conditionFlag'`` field,
    or to ``'default'`` when the field is absent. Records that end up under
    the same condition are combined with ``merge_data``.

    When one of the arguments is already a condition mapping, it is updated
    in place and returned (``a`` takes precedence if both are mappings).
    When both are single records, a new mapping is returned.

    Args:
        a: Data record or condition mapping.
        b: Data record or condition mapping.

    Returns:
        A dict mapping condition name to data record.
    """
    a_is_mapping = 'urls' not in a
    b_is_mapping = 'urls' not in b

    if a_is_mapping and b_is_mapping:
        for condition, value in b.items():
            if condition in a:
                a[condition] = merge_data(a[condition], value)
            else:
                # Take the record from b itself. The previous code assigned a
                # leftover loop variable here, which stored the wrong record
                # (or raised NameError when a was empty).
                a[condition] = value
        return a

    if a_is_mapping:
        condition = _condition_of(b)
        a[condition] = merge_data(a[condition], b) if condition in a else b
        return a

    if b_is_mapping:
        condition = _condition_of(a)
        # The record already stored in b is passed first, so its fields lead
        # in the merged lists and its conditionFlag is the one that is kept.
        b[condition] = merge_data(b[condition], a) if condition in b else a
        return b

    a_condition = _condition_of(a)
    b_condition = _condition_of(b)
    if a_condition == b_condition:
        return {a_condition: merge_data(a, b)}
    return {a_condition: a, b_condition: b}


def deduplicate(key, rows):
    """Reducer: collapse all rows of one key group into a single row.

    Rows are folded left to right. Each fold step produces a row with the
    group's ``tld`` and ``query`` and a ``data`` value obtained by passing
    both rows' ``data`` to ``merge_by_condition``. A group with exactly one
    row yields that row unchanged, because no fold step runs.

    Args:
        key: Mapping-like object providing ``'tld'`` and ``'query'``.
        rows: Iterable of rows in the group, each with a ``'data'`` field.

    Yields:
        The single folded row.
    """
    def fold(accumulated, row):
        return {
            'tld': key['tld'],
            'query': key['query'],
            'data': merge_by_condition(accumulated['data'], row['data']),
        }

    yield reduce(fold, rows)


def parse_args():
    """Parse the command-line options of the script.

    Options:
        --yt-proxy       required
        --yt-token       optional; defaults to the YT_TOKEN environment
                         variable (None when it is not set)
        --src-directory  required
        --dst-table      required

    Returns:
        The ``argparse.Namespace`` with the parsed values.
    """
    option_specs = (
        ('--yt-proxy', {'required': True}),
        ('--yt-token', {'default': os.environ.get('YT_TOKEN')}),
        ('--src-directory', {'required': True}),
        ('--dst-table', {'required': True}),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def main():
    """Entry point: run the deduplicating reduce over a directory of tables.

    Builds a YT client from the command-line options, lists the contents of
    ``--src-directory`` and runs ``deduplicate`` as a reduce operation over
    them, grouped by ``tld`` and ``query``, writing into ``--dst-table``.
    """
    args = parse_args()

    # NOTE(review): presumably the interpreter used to run the pickled reducer
    # on the cluster side; confirm against the YT wrapper docs.
    client_config = {
        'pickling': {
            'python_binary': '/skynet/python/bin/python',
        },
    }
    client = YtClient(args.yt_proxy, args.yt_token, config=client_config)

    source_tables = client.list(args.src_directory, absolute=True)
    client.run_reduce(
        deduplicate,
        source_table=source_tables,
        destination_table=args.dst_table,
        reduce_by=['tld', 'query'],
    )


# Run the job only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
