import json, sys, argparse, codecs
from download_files.download_images import download_images
from image_processing.random_transform import multiple_images_transform
from avatars_operations.utils import upload_multiple_images
from collections import defaultdict

def create_argument_parser():
    """Build the command-line parser for this script.

    Every option is mandatory.  Options listed with ``int`` or ``float``
    are converted by argparse; the remaining ones are kept as strings.
    """
    # (flag, converter) pairs, in the order they are registered.
    # A converter of None leaves argparse's default (plain string) in place.
    option_specs = (
        ('--in-filename', None),
        ('--out-filename', None),
        ('--avatars-processes', int),
        ('--toloka-namespace', None),
        ('--upload-try-count', int),
        ('--n-pos-transforms', int),
        ('--n-neg-transforms', int),
        ('--mds-environment', None),
        ('--image-field', None),
        ('--pos-dups-fraction', float),
    )

    parser = argparse.ArgumentParser()
    for flag, converter in option_specs:
        parser.add_argument(flag, type=converter, required=True)
    return parser


def json_deep_find(path, json_obj):
    """Return the value reached by following a dot-separated *path*.

    Each segment of *path* is used as a subscript on the value obtained
    so far, starting from *json_obj*.  Whatever the subscript raises
    (for example ``KeyError`` for a missing key) propagates to the caller.
    """
    node = json_obj
    remainder = path
    while True:
        # partition() peels off one segment at a time; an empty separator
        # means the segment just taken was the last one.
        segment, dot, remainder = remainder.partition('.')
        node = node[segment]
        if not dot:
            return node


def _build_tasks(uploaded_images, labels_dict):
    """Pair the uploaded files of each key and attach the known label.

    ``uploaded_images`` is iterated for dicts carrying ``'key'`` and
    ``'avatars_url'``; ``labels_dict`` maps the same keys to labels.
    A key produces a task only when exactly two uploads are present for
    it and it has a label.  Every other key, including labelled keys for
    which no upload came back at all, is reported on stderr and skipped.

    Returns a list of task dicts with ``inputValues`` and
    ``knownSolutions`` entries.
    """
    urls_by_key = defaultdict(list)
    for img in uploaded_images:
        urls_by_key[img['key']].append(img['avatars_url'])

    # write() with an explicit newline behaves the same on Python 2 and 3,
    # unlike the ``print >>sys.stderr`` statement.
    warning = "Not all duplicates were uploaded to avatars for {}\n"

    tasks = []
    for key, urls in urls_by_key.items():
        if len(urls) != 2 or key not in labels_dict:
            sys.stderr.write(warning.format(key))
            continue
        # Left/right follow the order in which the uploads were returned.
        tasks.append({
            'inputValues': {'image_left': urls[0], 'image_right': urls[1]},
            'knownSolutions': [{'outputValues': {'result': labels_dict[key]}}],
        })

    # Keys whose uploads all went missing never show up in urls_by_key;
    # report them too instead of dropping them silently.
    for key in labels_dict:
        if key not in urls_by_key:
            sys.stderr.write(warning.format(key))

    return tasks


def main():
    """Generate labelled duplicate-image tasks and write them as JSON.

    Steps, as driven by the command-line arguments:

    1. Load the input JSON and pull one image reference out of each
       element using the dotted ``--image-field`` path.
    2. Pass the references to ``download_images``.
    3. Split the downloaded list: the first ``--pos-dups-fraction`` share
       goes to the ``'pos'`` transform call, the rest to the ``'neg'`` one.
    4. Upload both the ``'filename'`` and the ``'transform'`` file of every
       generated entry via ``upload_multiple_images``.
    5. Pair the uploaded URLs per key, attach the label and dump the
       resulting task list to ``--out-filename``.
    """
    args = create_argument_parser().parse_args()

    with codecs.open(args.in_filename, 'r', 'utf8') as in_f:
        data = json.load(in_f)

    in_images = [json_deep_find(args.image_field, elem) for elem in data]

    downloaded_images = download_images(in_images, 'src_images')

    # int() truncates, so the positive share is rounded down.
    expected_pos_dups = int(args.pos_dups_fraction * len(downloaded_images))
    images_for_pos_dups = downloaded_images[:expected_pos_dups]
    images_for_neg_dups = downloaded_images[expected_pos_dups:]

    pos_duplicates = multiple_images_transform(
        images_for_pos_dups, 'pos', args.n_pos_transforms, 'duplicates')
    neg_duplicates = multiple_images_transform(
        images_for_neg_dups, 'neg', args.n_neg_transforms, 'duplicates')
    generated_duplicates = pos_duplicates + neg_duplicates

    # NOTE(review): if multiple_images_transform can emit several entries
    # sharing one 'url', they collapse to a single label here and later
    # produce more than two uploads per key, which are skipped - confirm.
    labels_dict = {f['url']: f['label'] for f in generated_duplicates}

    # Parenthesised single-argument form prints identically on Python 2
    # (print statement) and Python 3 (print function).
    print(generated_duplicates)

    # All 'filename' files first, then all 'transform' files, each tagged
    # with the entry's url so the two uploads can be paired afterwards.
    files_for_avatars = [
        {'key': f['url'], 'file_path': f[path_field], 'errors': ''}
        for path_field in ('filename', 'transform')
        for f in generated_duplicates
    ]
    uploaded_images, _ = upload_multiple_images(
        files_for_avatars,
        args.avatars_processes,
        args.toloka_namespace,
        args.mds_environment,
        args.upload_try_count,
    )

    result = _build_tasks(uploaded_images, labels_dict)

    with codecs.open(args.out_filename, 'w', 'utf8') as out_f:
        json.dump(result, out_f, indent=4, ensure_ascii=False)


# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
