#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import codecs
import copy
import datetime
import itertools
import json
import time


try:
    # Python 2: ``basestring`` is the common base of ``str`` and ``unicode``.
    _STRING_TYPES = basestring  # noqa: F821
except NameError:
    # Python 3: ``basestring`` is gone and ``str`` is the only text type.
    _STRING_TYPES = str


def parse_duration(s):
    """Convert a ``"<minutes>:<seconds>"`` string into a number of seconds.

    Only the first two colon-separated fields are read; any further fields
    are ignored. A field that is not a valid integer counts as 0.

    Args:
        s: The duration text. Anything that is not a string (e.g. ``None``
            for a missing value) is treated as "unknown".

    Returns:
        ``60 * minutes + seconds`` as an int, or the placeholder ``0.01``
        when the input is not a string, contains no ``':'``, or adds up to
        zero. The result is therefore never 0 (presumably so that callers
        can safely divide by it).
    """
    if not isinstance(s, _STRING_TYPES):
        return 0.01
    fields = s.split(':')
    if len(fields) < 2:
        return 0.01

    total = 0
    # Pair each of the first two fields with its weight in seconds.
    for weight, field in zip((60, 1), fields):
        try:
            total += weight * int(field)
        except ValueError:
            # Unparseable field: contribute nothing, as if it were 0.
            pass
    return total or 0.01


def not_dups(dur1, dur2):
    """Tell whether two durations differ too much to be the same video.

    Two durations are considered "definitely not duplicates" when BOTH hold:

    * they differ by more than 15 minutes (900 seconds), and
    * one is at most half of the other (``dur1 / dur2`` is ``<= 0.5`` or
      ``>= 2``).

    Args:
        dur1: First duration in seconds (int or float).
        dur2: Second duration in seconds (int or float). Must be non-zero
            whenever the two durations differ by more than 15 minutes.

    Returns:
        ``True`` if the pair can be ruled out as duplicates, else ``False``.

    Raises:
        ZeroDivisionError: If ``dur2`` is 0 and the absolute difference
            exceeds 15 minutes.
    """
    if abs(dur2 - dur1) <= 15 * 60:
        return False
    # Force true division: under Python 2, int / int floors, which turned
    # every ratio in (0.5, 1) into 0 and wrongly satisfied ``<= 0.5``.
    ratio = float(dur1) / dur2
    return ratio <= 0.5 or ratio >= 2


# Template for one "not duplicates" record. It is deep-copied for every pair
# and the copy's "inputValues" are then filled in with that pair's URLs.
boilerplate = {
    "inputValues": {
        "url1": "",
        "url2": "",
    },
    "outputValues": {
        "result": "DIFFERENT_ALGO"
    },
    "probability": 1.0,
    # Unix timestamp in whole seconds, captured once when the module is
    # loaded, so every record produced in one run shares the same value.
    # time.time() is used instead of strftime('%s'), which is a
    # platform-specific directive that is not available everywhere.
    "submitTs": int(time.time()),
    "algorithm": "majority_vote",
}


def main(argv=None):
    """Split SERP video results into candidate duplicate pairs and non-dups.

    Reads a JSON list of SERP entries from ``input``. For every entry that
    has a ``'components'`` key, the page URLs of its first 10 components are
    collected and every unordered pair of them is classified by video
    duration with ``not_dups``:

    * pairs ruled out as duplicates become records based on ``boilerplate``
      and are written to ``not_dups_list``;
    * all other pairs are written to ``output`` (under ``'pairs'``) together
      with the URL -> player HTML mapping (under ``'player_urls'``).

    The query text and URL list of each processed entry are written to
    ``output_serps``.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default), argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument('output')
    parser.add_argument('output_serps')
    parser.add_argument('not_dups_list')
    args = parser.parse_args(argv)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding, and close the handle deterministically.
    with codecs.open(args.input, 'r', 'utf8') as src:
        obj = json.load(src)

    serps_output = []
    result = []
    # Both maps are shared across all SERP entries; a URL seen again later
    # overwrites the earlier value.
    player_urls = {}
    durations = {}
    not_dups_list = []
    for it in obj:
        if 'components' not in it:
            continue
        urls = []
        # Only the first 10 components of each entry are considered.
        for x in itertools.islice(it['components'], 10):
            page_url = x['componentUrl']['pageUrl']
            urls.append(page_url)
            # ``or {}`` also covers an explicit null 'thumbadd' value.
            thumbadd = x.get('thumbadd') or {}
            player_urls[page_url] = thumbadd.get('videoPlayerHtml') or ''
            durations[page_url] = parse_duration(x.get('text.videoDuration'))
        serps_output.append(
            {
                'query': it['query']['text'],
                'serp': urls
            }
        )
        for pair in itertools.combinations(urls, 2):
            # Sort so that a pair is stored the same way regardless of the
            # order in which its URLs appeared in the SERP.
            first, second = sorted(pair)
            if not_dups(durations[first], durations[second]):
                new = copy.deepcopy(boilerplate)
                new['inputValues']['url1'] = first
                new['inputValues']['player_url1'] = player_urls[first]
                new['inputValues']['url2'] = second
                new['inputValues']['player_url2'] = player_urls[second]
                not_dups_list.append(new)
            else:
                result.append([first, second])

    # ``with`` guarantees each output is flushed and closed before returning.
    with open(args.output, 'w') as out:
        json.dump(
            {'pairs': result, 'player_urls': player_urls},
            out, indent=2
        )
    with codecs.open(args.output_serps, 'w', 'utf8') as out:
        json.dump(
            serps_output, out,
            indent=2, sort_keys=True, ensure_ascii=False
        )
    with codecs.open(args.not_dups_list, 'w', 'utf8') as out:
        json.dump(
            not_dups_list, out,
            indent=2, sort_keys=True, ensure_ascii=False
        )


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
