#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import yt.wrapper as yt

# Lower- and upper-case letters of the Russian alphabet (including ё/Ё).
# Mapper uses this to detect Cyrillic characters in recognized text.
CYRILLIC_LETTERS = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'

# Maps each --keep-lang choice to the values of a record's "Language" field
# that are accepted as that language.
LANGUAGE_IDS = {
    'en': ['en', 'eng'],
    'ru': ['ru', 'rus']
}

def parse_args():
    """Parse the command-line options of the language filter.

    Returns:
        argparse.Namespace with the attributes ``yt_proxy`` (optional),
        ``input_table``, ``output_table`` and ``keep_lang`` (all required).
        ``keep_lang`` is restricted to the keys of ``LANGUAGE_IDS``.
    """
    cli = argparse.ArgumentParser(description='Filter images by language.')
    cli.add_argument('--yt-proxy', help='YT proxy.')

    # Both table paths are mandatory and share the same option shape.
    for short_flag, long_flag, help_text in (
        ('-i', '--input-table', 'Input table path.'),
        ('-o', '--output-table', 'Output table path.'),
    ):
        cli.add_argument(short_flag, long_flag, required=True, help=help_text)

    cli.add_argument(
        '--keep-lang',
        required=True,
        choices=LANGUAGE_IDS.keys(),
        help='Language of images to keep.',
    )
    return cli.parse_args()

class Mapper(object):
    """Map callable that keeps only the records of one language.

    A record is emitted unchanged when its "Language" field is one of the
    identifiers listed in ``LANGUAGE_IDS[keep_lang]``.  When English is being
    kept, the record must additionally carry recognized text (taken from the
    first available of "WordGT", "Recognition", "OCRRecognition") and that
    text must not contain any character from ``CYRILLIC_LETTERS``.
    """

    # Columns that may hold the recognized text, in priority order.
    _RECOGNITION_FIELDS = ("WordGT", "Recognition", "OCRRecognition")

    def __init__(self, keep_lang):
        """Remember the language to keep.

        Args:
            keep_lang: a key of ``LANGUAGE_IDS`` (e.g. 'en' or 'ru').
        """
        self.keep_lang = keep_lang

    def __call__(self, record):
        """Yield ``record`` if it passes the language filter, else nothing.

        Args:
            record: mapping with the row's columns.

        Yields:
            The unmodified input record, at most once.

        Raises:
            UnicodeDecodeError: if the recognized text is a byte string that
                is not valid UTF-8.
        """
        if "Language" not in record:
            return
        if record["Language"] not in LANGUAGE_IDS[self.keep_lang]:
            return
        if self.keep_lang in LANGUAGE_IDS["en"]:
            # A column that is present but null is treated like a missing
            # one, so the lookup falls through to the next candidate instead
            # of failing on None.
            recognition = None
            for field in self._RECOGNITION_FIELDS:
                if field in record and record[field] is not None:
                    recognition = record[field]
                    break
            if recognition is None:
                return
            # Decode only byte strings: text values (Python 3 str, Python 2
            # unicode) are already characters and must not be decoded again.
            if isinstance(recognition, bytes):
                recognition = recognition.decode('utf-8')
            if any(c in CYRILLIC_LETTERS for c in recognition):
                return
        yield record

def main():
    """Entry point: parse the CLI options and run the filtering map operation.

    Runs ``Mapper`` over the input table and writes the surviving records to
    the output table via ``yt.run_map``.
    """
    args = parse_args()

    # --yt-proxy is optional.  Only override the client configuration when it
    # was actually given, so that a proxy configured elsewhere (e.g. through
    # the client's own configuration/environment) is not replaced by None.
    if args.yt_proxy is not None:
        yt.config["proxy"]["url"] = args.yt_proxy
    yt.run_map(Mapper(args.keep_lang), args.input_table, args.output_table)

# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

