#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import unicode_literals

import argparse
import codecs
import copy
import datetime
import json
import os
import sys
import time
from collections import defaultdict


# Translates the raw ``result`` value stored with a toloka judgement into the
# answer label that main() writes into the output records.
answer_dict = dict((
    ("EQUAL", "REL"),
    ("DIFFERENT", "NOT_REL"),
    ("OTHER", "NOT_REL"),
    ("_404", "404"),
))


def _load_json(path):
    """Return the parsed content of the UTF-8 JSON file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes instead of being left open for the rest of the run.
    """
    with codecs.open(path, 'r', 'utf8') as stream:
        return json.load(stream)


def _format_record(query, url, answer, ts, probability, player):
    """Return one tab-separated output record terminated by a newline."""
    return '{query}\t{url}\t{answer}\t{ts}\t{pr}\t{pl}\n'.format(
        query=query,
        url=url,
        answer=answer,
        ts=ts,
        pr=probability,
        pl=player
    )


def main():
    """Merge toloka judgements, SERP files and a 404 list into a TSV file.

    Command-line arguments (all positional):
        toloka: JSON list of judgements; each item provides ``ji.url1``,
            ``ji.url2``, ``ji.player_url1``, ``ts``, ``probability`` and
            ``result.result``.
        output: path of the UTF-8 text file to write.
        _404: JSON list whose items provide ``ji.url`` and ``ji.code``.
        serps: one or more JSON files, each a list of objects providing
            ``query``, ``fixed_pair.url``, ``serp`` (list of URLs) and
            ``shit_urls`` (list of URLs).

    Every record has the columns query, url, answer, ts, probability and
    player. Records are de-duplicated and written in sorted order.

    Raises:
        KeyError: if an input object lacks one of the keys listed above, or
            a matched judgement carries a result missing from
            ``answer_dict``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('toloka')
    parser.add_argument('output')
    parser.add_argument('_404')
    parser.add_argument('serps', nargs='+')
    args = parser.parse_args()

    toloka = _load_json(args.toloka)
    not_found = _load_json(args._404)

    # time.time() is portable; strftime('%s') is a platform-specific
    # extension that is not available everywhere.
    now = int(time.time())
    # Values used for records that have no judgement of their own.
    boilerplate = {'ts': now, 'probability': 1, 'player': ""}

    # Index judgements by (url1, url2); a later duplicate overwrites an
    # earlier one.
    pairs = {}
    for obj in toloka:
        ji = obj['ji']
        pairs[(ji['url1'], ji['url2'])] = {
            "ts": obj['ts'],
            "probability": obj['probability'],
            "result": obj['result']['result'],
            "player": ji['player_url1']
        }

    result = set()
    for serpfile in args.serps:
        for serp in _load_json(serpfile):
            fp_url = serp['fixed_pair']['url']
            query = serp['query']

            # SERP URLs are emitted only when a judgement exists for the
            # (url, fixed pair url) combination; empty entries are skipped.
            for url in serp['serp']:
                if not url:
                    continue
                judgement = pairs.get((url, fp_url))
                if judgement is None:
                    continue
                result.add(_format_record(
                    query, url, answer_dict[judgement['result']],
                    judgement['ts'], judgement['probability'],
                    judgement['player']))

            # NOTE(review): the literal DIFFERENT is written here although
            # answer_dict maps DIFFERENT to NOT_REL for judged URLs -
            # confirm this is intended.
            for url in serp['shit_urls']:
                result.add(_format_record(
                    query, url, 'DIFFERENT', boilerplate['ts'],
                    boilerplate['probability'], boilerplate['player']))

            # Every 404 entry is emitted once per SERP query; its code goes
            # into the player column.
            for obj in not_found:
                result.add(_format_record(
                    query, obj['ji']['url'], '404', boilerplate['ts'],
                    boilerplate['probability'], obj['ji']['code']))

    # NOTE(review): each record already ends with '\n', so joining with '\n'
    # leaves a blank line after every record. Kept unchanged because
    # downstream consumers may depend on the current layout.
    with codecs.open(args.output, 'w', 'utf8') as f:
        f.write('\n'.join(sorted(result)) + '\n')


# Run the conversion only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
