#!/usr/bin/env python

import argparse
import json
import os
import re

from yt.wrapper import YtClient


def external_cdict_mapper(namespace, max_phrases=1000):
    """Build a map function that turns external phrase records into cdict rows.

    The returned generator function takes a record with a ``'key'`` field and
    a tab-separated ``'value'`` field. It yields at most one output row with
    the fields ``cdict_namespace``, ``cdict_key`` and ``cdict_value``.

    A record is dropped (nothing is yielded) when its key is missing or cannot
    be converted to an integer, or when none of its phrases is well-formed.

    Args:
        namespace: value written to ``cdict_namespace`` of every emitted row.
        max_phrases: maximum number of well-formed phrases kept per record;
            phrases beyond this limit are discarded.

    Returns:
        A generator function suitable for passing to ``run_map``.
    """
    # A well-formed phrase is "<text>:<number>" with an optional ":<digits>"
    # suffix; the text part may not contain commas or colons because the
    # phrases are later joined with commas.
    correct_phrase_pattern = re.compile(r'^[^,:]+:[\d\.]+(?::\d+)?$')

    def do_map(rec):
        # check BannerID: the key must be present and parse as an integer
        try:
            int(rec['key'])
        except (KeyError, TypeError, ValueError):
            return

        # check phraselist: keep only well-formed phrases, capped at the limit.
        # A list comprehension (rather than filter) keeps the result sliceable
        # on both Python 2 and Python 3.
        phrases = [
            phr for phr in rec['value'].split('\t')
            if correct_phrase_pattern.match(phr)
        ][:max_phrases]
        if not phrases:
            return

        yield {
            'cdict_namespace': namespace,
            'cdict_key': rec['key'],
            'cdict_value': ','.join(phrases),
        }
    return do_map


def main():
    """Collect external phrase tables into a single sorted cdict table.

    Command-line arguments:
        --external-config-json: JSON text describing the candidates. It is
            iterated as a sequence of objects, each read for ``source_name``,
            ``mr_server``, ``mr_table`` and an optional ``deleted`` flag.
        --dst-cdict-table: path the resulting table is copied to.

    Every candidate that is not deleted and whose ``mr_server`` starts with
    ``hahn`` is mapped with ``external_cdict_mapper`` into one shared
    temporary table. That table is then sorted by
    ``(cdict_namespace, cdict_key)`` and copied over the destination.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--external-config-json', required=True)
    parser.add_argument('--dst-cdict-table', required=True)
    args = parser.parse_args()

    yt_config = {
        'proxy': {'url': 'hahn'},
        # None when YT_TOKEN is not set in the environment.
        'token': os.environ.get('YT_TOKEN'),
        'pickling': {
            'force_using_py_instead_of_pyc': True,
            # Selects modules that have a __file__ not ending in '.so'.
            # NOTE(review): presumably this keeps compiled extensions out of
            # the code shipped to jobs - confirm against the YT wrapper docs.
            'module_filter': lambda module: hasattr(module, '__file__') and not module.__file__.endswith('.so'),
        }
    }
    yt = YtClient(config=yt_config)

    external_config = json.loads(args.external_config_json)

    with yt.TempTable() as tmp:
        for ext_candidate in external_config:
            if ext_candidate.get('deleted'):
                continue

            source_name = ext_candidate['source_name']
            mr_server = ext_candidate['mr_server']
            if not mr_server.startswith('hahn'):
                # Single-argument call form prints the same text on Python 2
                # and Python 3 (the bare print statement is Python 2 only).
                print("WARN: candidate {} is not on hahn, skip it".format(source_name))
                continue

            # Append, so every candidate's rows accumulate in the same table.
            yt.run_map(
                external_cdict_mapper(source_name),
                ext_candidate['mr_table'],
                yt.TablePath(tmp, append=True)
            )

        yt.run_sort(tmp, sort_by=['cdict_namespace', 'cdict_key'])
        # Copy while still inside the TempTable context manager.
        yt.copy(tmp, args.dst_cdict_table, force=True, recursive=True)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
