#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import datetime
import json
import copy
from collections import defaultdict


def _load_json(path):
    """Parse the UTF-8 JSON document at *path*, closing the file afterwards."""
    with codecs.open(path, 'r', 'utf8') as f:
        return json.load(f)


def _read_sne(path):
    """Group queries from a tab-separated file by (season, episode).

    Lines with fewer than four tab-separated fields are skipped.  The first
    field is the query; the last two fields are parsed as integer season and
    episode numbers (a non-integer value raises ValueError).

    Returns a mapping ``(season, episode) -> set of queries``.
    """
    queries_by_episode = defaultdict(set)
    with codecs.open(path, 'r', 'utf8') as f:
        for line in f:
            tabs = line.strip().split('\t')
            if len(tabs) < 4:
                continue
            key = (int(tabs[-2]), int(tabs[-1]))
            queries_by_episode[key].add(tabs[0])
    return queries_by_episode


def _build_rows(dups_path, metadata, players, sne, now):
    """Build the set of output rows for every (query, duplicate page) pair.

    Each line of *dups_path* holds a page URL and a reference URL separated
    by a tab; lines with fewer than two fields are skipped.  The reference
    URL must be present in *metadata*, otherwise KeyError is raised.
    """
    rows = set()
    with codecs.open(dups_path, 'r', 'utf8') as f:
        for line in f:
            tabs = line.strip().split('\t')
            if len(tabs) < 2:
                continue
            page_url, reference_url = tabs[0], tabs[1]
            reference = metadata[reference_url]
            # NOTE(review): season/episode are used exactly as stored in the
            # metadata JSON, while the sne keys are ints; if the JSON stores
            # them as strings nothing will match -- confirm the schema.
            key = (reference['season'], reference['episode'])
            # Pages listed in the players file get probability 1, others 0.6.
            pr = 1 if page_url in players else 0.6
            player_code = players.get(page_url) or ''
            # .get() avoids inserting an empty set into the defaultdict for
            # every episode that has no queries.
            for query in sne.get(key, ()):
                rows.add(
                    '{query}\t{url}\tREL\t{ts}\t{pr}\t{pl}'.format(
                        query=query,
                        url=page_url,
                        pr=pr,
                        ts=now,
                        pl=player_code
                    )
                )
    return rows


def main(argv=None):
    """Join duplicate pages with per-episode queries and write the result.

    Command-line options (all are file paths):
      --dups      tab-separated "page_url<TAB>reference_url" lines
      --metadata  JSON list of objects with 'url', 'season', 'episode'
      --sne       tab-separated lines: query first, season and episode last
      --players   JSON list of objects with x['ji']['url'] and x['ji']['code']
      --output    destination file

    Every output line is
    ``query<TAB>page_url<TAB>REL<TAB>timestamp<TAB>probability<TAB>player``;
    lines are de-duplicated, sorted and written as UTF-8.

    argv: optional list of arguments; defaults to ``sys.argv[1:]``.
    """
    # Local import: ``time`` is not among the file-level imports.
    import time

    parser = argparse.ArgumentParser()
    parser.add_argument('--dups')
    parser.add_argument('--metadata')
    parser.add_argument('--sne')
    parser.add_argument('--players')
    parser.add_argument('--output')
    args = parser.parse_args(argv)

    # strftime('%s') is a platform-specific extension; time.time() is the
    # portable way to get the current Unix timestamp.
    now = int(time.time())

    players = {
        x['ji']['url']: x['ji']['code']
        for x in _load_json(args.players)
    }
    metadata = {x['url']: x for x in _load_json(args.metadata)}
    sne = _read_sne(args.sne)

    result = _build_rows(args.dups, metadata, players, sne, now)

    with codecs.open(args.output, 'w', 'utf8') as f:
        f.write('\n'.join(sorted(result)) + '\n')

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
