import codecs
import sys
import os
import json
import re
import argparse
import logging

# Configure the root logger once at import time: DEBUG level, and every record
# is prefixed with the emitting file name, line number, level and timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(filename)s[LINE:%(lineno)d]# %(levelname)-8s [%(asctime)s]  %(message)s',
)

def create_argument_parser():
    """Build the command-line parser for this script.

    Two options are registered, both mandatory:
    ``--input-serpsets-dir`` and ``--output-serpsets-dir``.

    :return: the configured ``argparse.ArgumentParser`` instance.
    """
    arg_parser = argparse.ArgumentParser()

    # Both options share the same shape (required, single string value).
    for option in ('--input-serpsets-dir', '--output-serpsets-dir'):
        arg_parser.add_argument(option, required=True)

    return arg_parser


def remove_unnecessary_fields(data):
    """Strip every non-whitelisted field from serps and their components.

    A key survives when it is exactly one of the allowed field names or when
    it starts with one of the allowed prefixes; every other key is deleted.
    Filtering is applied to each serp dict and, if the serp has a
    'components' entry, to each component dict inside it.

    The dicts are modified in place.

    :param data: iterable of serp dicts; a serp's 'components' value, when
        present, is iterated as a sequence of component dicts.
    :return: the same ``data`` object that was passed in.
    """
    ALLOWED_SERP_FIELDS = frozenset(['query', 'components', 'headers', 'type', 'texts.labels'])
    ALLOWED_SERP_PREFIX = ('serp_query_param.', 'double.', 'text.')
    ALLOWED_COMPONENT_FIELDS = frozenset([
        'componentInfo', 'componentUrl', 'type', 'imageadd', 'webadd', 'thumbadd',
        'url.imageUrl', 'url.imageBigThumbHref', 'dimension.SCRAPER_IMAGE_DIMENSION', 'long.mtime',
    ])
    ALLOWED_COMPONENT_PREFIX = ('text.', 'double.', 'json.', 'long.market')

    def _prune(mapping, allowed_fields, allowed_prefixes):
        # Iterate over a snapshot of the keys: deleting entries while looping
        # over the live key view raises RuntimeError on Python 3.
        for key in list(mapping.keys()):
            # str.startswith accepts a tuple and checks all prefixes at once.
            if key in allowed_fields or key.startswith(allowed_prefixes):
                continue
            del mapping[key]

    for serp in data:
        _prune(serp, ALLOWED_SERP_FIELDS, ALLOWED_SERP_PREFIX)
        if 'components' not in serp:
            continue
        for comp in serp['components']:
            _prune(comp, ALLOWED_COMPONENT_FIELDS, ALLOWED_COMPONENT_PREFIX)

    return data


def freon_v2_filter_packed_serpsets(args):
    """Filter every packed serpset file from the input dir into the output dir.

    Each file in ``args.input_serpsets_dir`` whose name is ``<digits>.json``
    is loaded as UTF-8 JSON, passed through ``remove_unnecessary_fields`` and
    written under the same file name into ``args.output_serpsets_dir``.
    The output directory is created when it does not exist yet.  If no
    matching input file is found the function returns without creating or
    writing anything.

    :param args: object exposing ``input_serpsets_dir`` and
        ``output_serpsets_dir`` attributes (directory paths).
    :return: None
    """
    logging.info("Collecting files")
    # Sorted so the processing order (and the log below) is deterministic;
    # os.listdir returns entries in arbitrary order.
    serpset_filenames = [
        (os.path.join(args.input_serpsets_dir, fname),
         os.path.join(args.output_serpsets_dir, fname))
        for fname in sorted(os.listdir(args.input_serpsets_dir))
        if re.match(r'^[0-9]+\.json$', fname)
    ]
    logging.info("Found: %s", json.dumps([in_name for in_name, _ in serpset_filenames]))

    if not serpset_filenames:
        return

    if not os.path.exists(args.output_serpsets_dir):
        logging.info("Creating output dir %s", args.output_serpsets_dir)
        os.makedirs(args.output_serpsets_dir)

    for in_serpset_filename, out_serpset_filename in serpset_filenames:
        logging.info("Loading %s", in_serpset_filename)
        with codecs.open(in_serpset_filename, 'r', 'utf8') as f:
            # No `encoding=` keyword: the stream already yields decoded text,
            # and json.load rejects that keyword with TypeError on Python 3.9+.
            cur_serpset = json.load(f)

        logging.info("Filtering %s", in_serpset_filename)
        filtered_serpset = remove_unnecessary_fields(cur_serpset)

        logging.info("Dumping to %s", out_serpset_filename)
        with codecs.open(out_serpset_filename, 'w', 'utf8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            json.dump(filtered_serpset, f, ensure_ascii=False, indent=2)


def main():
    """Script entry point: parse the command line and run the filtering step."""
    parser = create_argument_parser()
    freon_v2_filter_packed_serpsets(parser.parse_args())


# Run the tool only when executed as a script, not when imported as a module.
if "__main__" == __name__:
    main()
