import argparse
import string
import os.path

import yt.wrapper as yt


def parse_args():
    """Parse the command-line options of the script.

    Returns:
        argparse.Namespace: with attributes ``yt_proxy`` (str or None),
        ``blacklist_file`` (list of paths), ``src``, ``word_column``
        (list of column names), ``dst`` and ``memory_limit`` (int,
        defaults to ``1024**3``).

    Exits with a usage error (``SystemExit``) when a required option is
    missing.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--yt-proxy', default=None, help='YT proxy')
    parser.add_argument('--blacklist-file', nargs='*', required=True, help="Path to file with blacklist")
    parser.add_argument('--src', required=True, help="Path to source table")
    # Required: main() iterates over this value, so leaving it optional meant
    # that omitting the flag produced None and a bare TypeError later on
    # instead of a clear usage message.
    parser.add_argument('--word-column', nargs='*', required=True, help="Name of column with the word")
    parser.add_argument('--dst', required=True, help="Path to destination table")
    parser.add_argument('--memory-limit', type=int, default=1024**3, help="Memory limit for jobs")
    return parser.parse_args()


def is_allowed(blacklist, word):
    """Tell whether *word* passes the blacklist.

    The word is lower-cased and stripped, then rejected when any of the
    following is found in *blacklist*:

    * the normalized word itself;
    * the word with ASCII punctuation and spaces removed;
    * any piece obtained by splitting the word on ASCII punctuation
      and whitespace.

    Args:
        blacklist: container of lower-case entries supporting ``in``.
        word: the ``str`` to check.

    Returns:
        bool: ``False`` if the word is blacklisted, ``True`` otherwise.
    """
    assert type(word) == str
    normalized = word.lower().strip()
    if normalized in blacklist:
        return False

    # Turn every ASCII punctuation character into a space so that it acts
    # as a separator in the checks below.
    to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    spaced = normalized.translate(to_space)

    glued = spaced.replace(' ', '')
    if glued in blacklist:
        return False

    return not any(piece in blacklist for piece in spaced.split())


@yt.aggregator
class MapFilter(object):
    """Mapper that keeps only the records whose words pass the blacklist.

    Calling the instance with an iterable of records yields every record
    for which all configured columns hold an allowed word (see
    ``is_allowed``).

    Args:
        filenames: paths of the blacklist files, opened with ``open()``
            when the mapper is invoked.
        columns: keys of the record fields to check; each value must
            provide ``.decode('utf-8')`` (i.e. be bytes-like).
    """

    def __init__(self, filenames, columns):
        self.filenames = filenames
        self.columns = columns

    def _load_blacklist(self):
        """Read all blacklist files into a set of stripped, lower-cased entries."""
        blacklist = set()
        for filename in self.filenames:
            # Record values are decoded as UTF-8, so read the blacklist with
            # the same encoding rather than the locale default of the host.
            with open(filename, encoding='utf-8') as fin:
                # Iterate the file lazily instead of materializing all lines.
                for line in fin:
                    entry = line.strip().lower()
                    if entry:
                        blacklist.add(entry)
        return blacklist

    def __call__(self, recs):
        """Yield the records from *recs* whose checked columns are all allowed."""
        blacklist = self._load_blacklist()
        for rec in recs:
            if all(is_allowed(blacklist, rec[column].decode('utf-8')) for column in self.columns):
                yield rec


def main():
    """Parse the arguments and run the blacklist-filtering map operation.

    The blacklist files are passed to the operation as local files, and
    the mapper is given their base names and the UTF-8 encoded names of
    the columns to check.
    """
    args = parse_args()

    proxy_url = args.yt_proxy
    if proxy_url:
        yt.config["proxy"]["url"] = proxy_url

    blacklist_names = [os.path.basename(path) for path in args.blacklist_file]
    # Column names are encoded to bytes; the operation below uses a YSON
    # format with encoding=None.
    column_keys = [name.encode('utf-8') for name in args.word_column]
    mapper = MapFilter(blacklist_names, column_keys)

    print(args.blacklist_file)
    yt.run_map(
        mapper,
        args.src,
        args.dst,
        memory_limit=args.memory_limit,
        format=yt.YsonFormat(encoding=None),
        local_files=args.blacklist_file,
    )


# Run the filter only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
