#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import json
import datetime
import time
import random
from collections import defaultdict

def HandleOption():
    """Build the command-line parser for this script.

    Options:
        --input       (required) path of the file that is read line by line.
        --honeypot    optional switch selecting honeypot mode. May be given
                      bare (``--honeypot``) or with a value
                      (``--honeypot 1``); the values "", "0", "false", "no"
                      and "off" (case-insensitive) turn the mode off, any
                      other value turns it on. Defaults to off.
        --out_screen  (required) path the JSON list of tasks is written to.
        --out_mark    (required) path the tab-separated marks are written to.

    Returns:
        argparse.ArgumentParser: the configured, not yet evaluated parser.
    """
    def parse_flag(value):
        # A plain string option would treat "False" or "0" as truthy, so the
        # textual value is mapped to a real boolean here.
        return value.strip().lower() not in ("", "0", "false", "no", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", dest="input", required=True)
    # nargs="?" keeps "--honeypot VALUE" working and also allows the bare
    # flag, in which case argparse stores `const`.
    parser.add_argument("--honeypot", dest="honeypot", nargs="?",
                        const=True, default=False, type=parse_flag)
    parser.add_argument("--out_screen", dest="out_screen", required=True)
    parser.add_argument("--out_mark", dest="out_mark", required=True)
    return parser

# Per-host budget of honeypot URLs: main() takes fewer than or exactly
# HONEYPOT_NUMBER/2 URLs marked "ok" and the same number of URLs with any
# other mark from each host.
HONEYPOT_NUMBER = 50
# In non-honeypot mode main() stops emitting URLs once this many have been
# written.
TASKS_NUMBER = 200

def _read_honeypot_marks(path):
    """Read ``host url mark`` lines from *path* into ``{host: {url: mark}}``.

    Fields are whitespace-separated. Blank lines are skipped; a later line
    for the same host and url overwrites the earlier mark.
    """
    marks_by_host = {}
    with open(path, "r") as src:
        for line in src:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record, and indexing would fail.
                continue
            host, url, mark = fields[0], fields[1], fields[2]
            marks_by_host.setdefault(host, {})[url] = mark
    return marks_by_host


def _count_urls(path):
    """Count how often each URL (first field of a line) occurs in *path*.

    Blank lines and URLs containing "news.yandex.ru" are skipped.
    """
    counts = defaultdict(int)
    with open(path, "r") as src:
        for line in src:
            fields = line.split()
            if not fields:
                continue
            url = fields[0]
            if "news.yandex.ru" in url:
                continue
            counts[url] += 1
    return counts


def _pick_honeypots(marks_by_host):
    """Randomly pick up to HONEYPOT_NUMBER/2 "ok" and "bad" URLs per host.

    Any mark other than "ok" is reported as "bad". Returns ``(tasks, marked)``
    where *tasks* is a list of ``{"url": url}`` dicts and *marked* the
    matching ``url<TAB>ok|bad`` lines.
    """
    limit = HONEYPOT_NUMBER / 2
    tasks = []
    marked = []
    for host in marks_by_host:
        url_marks = marks_by_host[host]
        # random.shuffle needs a mutable sequence; on Python 3 a dict view
        # is not one, so copy the keys into a list first.
        urls = list(url_marks)
        random.shuffle(urls)
        ok_count = 0
        bad_count = 0
        for url in urls:
            is_ok = url_marks[url] == "ok"
            if is_ok and ok_count < limit:
                tasks.append({"url": url})
                marked.append("\t".join([url, "ok"]) + "\n")
                ok_count += 1
            elif not is_ok and bad_count < limit:
                tasks.append({"url": url})
                marked.append("\t".join([url, "bad"]) + "\n")
                bad_count += 1
            if ok_count >= limit and bad_count >= limit:
                # Both quotas for this host are filled.
                break
    return tasks, marked


def _pick_tasks(url_counts):
    """Take URLs from *url_counts* until TASKS_NUMBER have been emitted.

    Returns ``(tasks, marked)`` where *tasks* is a list of ``{"url": url}``
    dicts and *marked* the matching ``url<TAB>count`` lines.
    """
    tasks = []
    marked = []
    for taken, url in enumerate(url_counts, 1):
        tasks.append({"url": url})
        marked.append("\t".join([url, str(url_counts[url])]) + "\n")
        if taken >= TASKS_NUMBER:
            break
    return tasks, marked


def main():
    """Read the input file and write the task list and the marks file.

    In honeypot mode (``--honeypot`` truthy) the input holds
    ``host url mark`` lines and a random per-host sample of "ok"/"bad" URLs
    is produced. Otherwise the first field of each line is treated as a URL,
    occurrences are counted, and up to TASKS_NUMBER URLs are emitted with
    their counts. The tasks are written as JSON to ``--out_screen`` and the
    tab-separated marks to ``--out_mark``.
    """
    args = HandleOption().parse_args()

    if args.honeypot:
        out, marked = _pick_honeypots(_read_honeypot_marks(args.input))
    else:
        out, marked = _pick_tasks(_count_urls(args.input))

    with open(args.out_screen, "w") as out_screen:
        json.dump(out, out_screen, indent=4)
    with open(args.out_mark, "w") as out_mark:
        out_mark.writelines(marked)


# Run the tool only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
