import json, sys, random, codecs
from itertools import combinations

# Mines pairs of candidate images from a random sample of SERPs read from a
# JSON file and writes them to another JSON file.  Two kinds of pairs are
# produced for every sampled SERP:
#   * "inner_doc": the first unique candidate of a component paired with another
#     unique candidate of the same component;
#   * "cross_doc": the first unique candidate of one component paired with a
#     random unique candidate of a different component.
# The code avoids Python-2-only constructs so it runs on both Python 2.7 and 3.


def _unique_candidates(candidates):
    """Return ``(rank, candidate)`` tuples for the first occurrence of each candidate.

    ``rank`` is the 1-based position of the candidate in the original
    sequence, so the ranks of dropped duplicates are simply skipped.
    Candidates must be hashable.
    """
    seen = set()
    unique = []
    for rank, cand in enumerate(candidates, 1):
        if cand not in seen:
            seen.add(cand)
            unique.append((rank, cand))
    return unique


def _image(comp_rank, ranked_cand):
    """Build the output record for one ``(candidate_rank, url)`` tuple."""
    cand_rank, url = ranked_cand
    return {'url': url, 'candidate_rank': cand_rank, 'component_rank': comp_rank}


def mine_serp_pairs(serp, docs_per_query):
    """Mine inner-document and cross-document image pairs from a single SERP.

    Args:
        serp: dict with ``serp['query']['text']``, ``serp['query']['country']``
            and ``serp['components']``; every component provides
            ``['componentInfo']['rank']`` and ``['imageadd']['candidates']``.
        docs_per_query: upper bound on the number of pairs of each kind.

    Returns:
        A list of pair dicts: all ``inner_doc`` pairs first, then all
        ``cross_doc`` pairs.
    """
    query_text = serp['query']['text']
    query_country = serp['query']['country']

    components = [
        (comp['componentInfo']['rank'],
         _unique_candidates(comp['imageadd']['candidates']))
        for comp in serp['components']
    ]

    def make_pair(duplicate_type, image1, image2):
        return {'query_text': query_text,
                'query_country': query_country,
                'duplicate_type': duplicate_type,
                'image1': image1,
                'image2': image2}

    pairs = []

    # Inner-document pairs need at least two distinct candidates in a component.
    inner_pool = [comp for comp in components if len(comp[1]) >= 2]
    for comp_rank, candidates in random.sample(
            inner_pool, min(docs_per_query, len(inner_pool))):
        main_cand = candidates[0]
        other_cand = random.choice(candidates[1:])
        pairs.append(make_pair('inner_doc',
                               _image(comp_rank, main_cand),
                               _image(comp_rank, other_cand)))

    # Cross-document pairs need at least one candidate on each side; components
    # without candidates are skipped, otherwise indexing / random.choice on an
    # empty list raises IndexError.
    usable = [comp for comp in components if comp[1]]
    cross_pool = list(combinations(usable, 2))
    for comp1, comp2 in random.sample(
            cross_pool, min(docs_per_query, len(cross_pool))):
        # Randomise which component supplies the "main" (first) image.
        if random.randint(0, 1) == 1:
            comp1, comp2 = comp2, comp1
        main_cand = comp1[1][0]
        other_cand = random.choice(comp2[1])
        pairs.append(make_pair('cross_doc',
                               _image(comp1[0], main_cand),
                               _image(comp2[0], other_cand)))

    return pairs


def sample_serps(all_serps, total_queries):
    """Return ``total_queries`` randomly chosen SERPs without repetition.

    If more SERPs are requested than are available, a warning is written to
    stderr and all of them are returned (in random order) instead of failing
    with ``ValueError``; this mirrors the clamping of ``docs_per_query``.
    """
    available = len(all_serps)
    if total_queries > available:
        sys.stderr.write(
            "Requested {} queries but only {} are available; using all of them\n"
            .format(total_queries, available))
        total_queries = available
    return random.sample(all_serps, total_queries)


def main(argv):
    """Run the script with ``argv`` (same layout as ``sys.argv``); return the exit code."""
    if len(argv) != 5:
        sys.stderr.write(
            'Usage: {} <in_filename> <total_queries> <docs_per_query> <out_filename>\n'
            .format(argv[0]))
        return 1

    serp_filename = argv[1]
    total_queries = int(argv[2])
    docs_per_query = int(argv[3])
    out_filename = argv[4]

    # The file object already decodes UTF-8, so json.load needs no encoding
    # argument (that keyword was removed in Python 3.9).
    with codecs.open(serp_filename, 'r', 'utf8') as f:
        all_serps = json.load(f)
    sys.stderr.write("Input file is successfully loaded\n")

    serps = sample_serps(all_serps, total_queries)

    mined_pairs = []
    for s_i, s in enumerate(serps, 1):
        # "\r" keeps the progress indicator on a single terminal line.
        sys.stderr.write("\rProcessing {} of {} serp".format(s_i, len(serps)))
        mined_pairs.extend(mine_serp_pairs(s, docs_per_query))

    sys.stderr.write("\nSerps parsing is finished\n")

    with codecs.open(out_filename, 'w', 'utf8') as f:
        json.dump(mined_pairs, f, ensure_ascii=False, indent=2)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
