#-*- coding: UTF-8 -*-

import requests
import json
import codecs
import argparse
import re
import datetime
import os.path
import urllib, urllib2
import sys
import time

def parse_args():
    """Parse the command line and return the resulting namespace.

    Four string options are accepted and all of them are mandatory:
    ``--data``, ``--urls_with_content``, ``--dups`` and ``--output``.
    """
    parser = argparse.ArgumentParser()
    required_flags = ("--data", "--urls_with_content", "--dups", "--output")
    for flag in required_flags:
        parser.add_argument(flag, type=str, required=True)
    return parser.parse_args()

def _load_json(path):
    """Return the object decoded from the UTF-8 JSON file at *path*."""
    # The context manager closes the handle even when parsing fails.
    with codecs.open(path, 'r', encoding="utf-8") as handle:
        return json.load(handle)


def _strip_scheme(url):
    """Return *url* without its leading ``scheme://`` prefix.

    Only the first ``://`` is treated as the separator, so a URL that embeds
    another URL (for example in its query string) is kept whole.  A URL with
    no ``://`` at all is returned unchanged instead of raising IndexError.
    """
    return url.split("://", 1)[-1]


def _index_duplicates(dups):
    """Map every URL to the list of URLs it is paired with in *dups*.

    Each element of *dups* must have ``url_1`` and ``url_2`` keys; the
    relation is stored in both directions.
    """
    dups_stats = {}
    for dup in dups:
        url_1 = dup["url_1"]
        url_2 = dup["url_2"]
        dups_stats.setdefault(url_1, []).append(url_2)
        dups_stats.setdefault(url_2, []).append(url_1)
    return dups_stats


def _group_by_serp(data):
    """Group rows of *data* by (query, serp_set_download_id, serp_request_download_id).

    Returns a dict whose values are lists of ``{"url", "position"}`` dicts
    ordered by ascending position.
    """
    grouped = {}
    for info in data:
        key = (info["query"], info["serp_set_download_id"], info["serp_request_download_id"])
        grouped.setdefault(key, []).append({"url": info["url"], "position": info["position"]})
    # Sort each group once at the end; the sort is stable, so rows that share
    # a position keep their input order.
    for entries in grouped.values():
        entries.sort(key=lambda entry: entry["position"])
    return grouped


def _annotate_serp(key, entries, urls_with_content, dups_stats):
    """Build the output records for one group of position-ordered *entries*.

    ``has_content`` is 1 when the scheme-less URL is in *urls_with_content*,
    otherwise 0.  ``dup_pos`` is the position of a duplicate URL that occurred
    earlier in the same group, or -1 when there is none; when several earlier
    duplicates exist, the last one listed in *dups_stats* wins.
    """
    query, serp_set_download_id, serp_request_download_id = key
    records = []
    prev_urls_pos = {}
    for url_info in entries:
        cleaned_url = _strip_scheme(url_info["url"])
        has_content = int(cleaned_url in urls_with_content)
        dup_pos = -1
        for url in dups_stats.get(cleaned_url, []):
            if url in prev_urls_pos:
                # Diagnostic trace of every duplicate hit (same text as before,
                # written so that it parses on both Python 2 and Python 3).
                print("%s %s %s" % (cleaned_url, url, dups_stats[cleaned_url]))
                dup_pos = prev_urls_pos[url]
        prev_urls_pos[cleaned_url] = url_info["position"]
        records.append({"query" : query,
                        "url" : url_info["url"],
                        "position" : url_info["position"],
                        "serp_set_download_id" : serp_set_download_id,
                        "serp_request_download_id" : serp_request_download_id,
                        "has_content" : has_content,
                        "dup_pos" : dup_pos})
    return records


def main():
    """Annotate search-result rows with content and duplicate information.

    Inputs (paths taken from the command line, all UTF-8 JSON):
      * ``--data``: list of objects with ``query``, ``serp_set_download_id``,
        ``serp_request_download_id``, ``url`` and ``position`` keys.
      * ``--urls_with_content``: list of objects with a ``url`` key.
      * ``--dups``: list of objects with ``url_1`` and ``url_2`` keys.

    URLs from ``--data`` are compared against the other two files after their
    ``scheme://`` prefix has been removed.

    Output (``--output``): a JSON list with one record per input row, carrying
    the original fields plus ``has_content`` (0/1) and ``dup_pos`` (position of
    an earlier duplicate in the same group, or -1).
    """
    args = parse_args()

    data = _load_json(args.data)
    urls_with_content = set(elem["url"] for elem in _load_json(args.urls_with_content))
    dups_stats = _index_duplicates(_load_json(args.dups))

    data_stats = _group_by_serp(data)

    results = []
    for key in data_stats:
        results.extend(_annotate_serp(key, data_stats[key], urls_with_content, dups_stats))

    # Write through a UTF-8 codec, mirroring how the inputs are read, and let
    # the context manager close the file even if serialisation fails.
    with codecs.open(args.output, 'w', encoding="utf-8") as out:
        out.write(json.dumps(results, indent=2, ensure_ascii=False))

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
