#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division

import argparse
import copy
import datetime
import gc
import itertools
import json
import time
from collections import defaultdict


try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3: basestring no longer exists


def _duration_field_to_int(field):
    """Return *field* parsed as an int, or 0 when it is not a valid integer."""
    try:
        return int(field)
    except ValueError:
        return 0


def parse_duration(s):
    """Convert a ``"MM:SS"`` or ``"HH:MM:SS"`` string into seconds.

    Returns 0.01 instead of 0 for anything that cannot be interpreted
    (non-string input, no ``:`` separator, or a total of zero seconds) so
    that the result is always truthy and safe to use as a divisor.

    Individual fields that are not valid integers count as 0.  With two
    fields the value is read as minutes:seconds; with three or more, the
    first three fields are read as hours:minutes:seconds and any further
    fields are ignored.
    """
    if not isinstance(s, _STRING_TYPES):
        return 0.01
    parts = s.split(':')
    if len(parts) < 2:
        return 0.01
    fields = [_duration_field_to_int(part) for part in parts[:3]]
    if len(fields) == 2:
        hours = 0
        mins, secs = fields
    else:
        hours, mins, secs = fields
    # `or 0.01` keeps a zero-length duration from becoming a zero divisor.
    return (3600 * hours + 60 * mins + secs) or 0.01


def get_duration(dct):
    """Return the raw duration value stored on a component dict.

    Looks up ``NULLABLE___COMPONENT___text.videoDuration`` first.  When that
    value is missing or falsy, the result of looking up the alternative key
    ``NULLABLE___COMPONENT___text.videoDudation`` is returned instead
    (defaulting to ``''`` when that key is absent too).
    """
    # NOTE(review): the second key looks like a misspelling of the first;
    # presumably some records carry it -- confirm against the input data.
    return (
        dct.get('NULLABLE___COMPONENT___text.videoDuration', '')
        or dct.get('NULLABLE___COMPONENT___text.videoDudation', '')
    )


def not_dups(dur1, dur2):
    """Tell whether two durations differ too much to be the same video.

    Returns True only when both conditions hold:

    * the durations differ by more than 15 minutes (900 seconds), and
    * one duration is at most half of, or at least twice, the other
      (``dur1 / dur2 <= 0.5`` or ``dur1 / dur2 >= 2``).

    Durations are expected in seconds.
    """
    if abs(dur2 - dur1) <= 15 * 60:
        return False
    if dur2 == 0:
        # The gap already exceeds 15 minutes, so dur1 is non-zero and the
        # ratio is unbounded; answer directly instead of dividing by zero.
        return True
    # float() keeps this a true division even without the module-level
    # `from __future__ import division` on Python 2.
    ratio = float(dur1) / dur2
    return ratio <= 0.5 or ratio >= 2


# Template record for a pair that is labelled automatically rather than
# judged: main() deep-copies it for every pair that not_dups() accepts and
# fills in the URL fields under "inputValues".
boilerplate = {
    "inputValues": {
        "url1": "",
        "url2": "",
    },
    "outputValues": {
        "result": "DIFFERENT_ALGO"
    },
    "probability": 1.0,
    # Unix timestamp in whole seconds, taken once when the module is loaded.
    # time.time() is used instead of strftime('%s') because '%s' is a
    # platform-specific extension that is not available everywhere.
    "submitTs": int(time.time()),
    "algorithm": "majority_vote",
}


def _player_html(component):
    """Return the component's ``videoPlayerHtml`` value, or ``""`` if absent."""
    # The key is marked NULLABLE, so tolerate a missing or null sub-dict the
    # same way get_duration() tolerates a missing duration key.
    thumb = component.get('NULLABLE___COMPONENT___thumbadd') or {}
    return thumb.get('videoPlayerHtml') or ""


def main():
    """Split SERP component pairs into obvious non-duplicates and candidates.

    Command-line arguments:
        --input          JSON file holding a list of component dicts.
        --output_serps   Receives the components grouped by ``query_text``,
                         each group sorted by ``component_position``.
        --output_pairs   Receives the pairs that could not be ruled out by
                         duration (url1/url2/player_url1/player_url2).
        --not_dups_list  Receives a copy of ``boilerplate`` for every pair
                         that ``not_dups`` classified as different.

    For each query, every 2-combination of its first 10 components (by
    position) is considered; within a pair the components are ordered by
    ``component_page_url``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input')
    parser.add_argument('--output_serps')
    parser.add_argument('--output_pairs')
    parser.add_argument('--not_dups_list')

    args = parser.parse_args()

    with open(args.input) as src:
        records = json.load(src)

    serps = defaultdict(list)
    for record in records:
        serps[record['query_text']].append(record)
    # Drop the flat list early; the grouped copy is all that is needed.
    del records
    gc.collect()

    pairs = []
    not_dups_list = []

    for components in serps.values():
        # Sorting in place also fixes the order written to --output_serps.
        components.sort(key=lambda c: c['component_position'])
        for pair in itertools.combinations(components[:10], 2):
            # Order by URL so a given pair is always emitted the same way
            # round regardless of the components' positions.
            first, second = sorted(
                pair, key=lambda c: c['component_page_url']
            )
            dur0 = parse_duration(get_duration(first))
            dur1 = parse_duration(get_duration(second))
            url0 = first['component_page_url']
            url1 = second['component_page_url']
            player_url0 = _player_html(first)
            player_url1 = _player_html(second)
            if not_dups(dur0, dur1):
                # deepcopy: the template's nested dicts must not be shared
                # between records.
                new = copy.deepcopy(boilerplate)
                new['inputValues']['url1'] = url0
                new['inputValues']['player_url1'] = player_url0
                new['inputValues']['url2'] = url1
                new['inputValues']['player_url2'] = player_url1
                not_dups_list.append(new)
            else:
                pairs.append(
                    {
                        "url1": url0,
                        "url2": url1,
                        "player_url1": player_url0,
                        "player_url2": player_url1,
                    }
                )

    # `with` guarantees each output is flushed and closed before exit.
    with open(args.output_serps, 'w') as out:
        json.dump(dict(serps), out, indent=2)
    with open(args.output_pairs, 'w') as out:
        json.dump(pairs, out, indent=2)
    with open(args.not_dups_list, 'w') as out:
        json.dump(not_dups_list, out, indent=2)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
