#!/usr/bin/env python
# coding: utf-8

import sys
# Python 2-only hack: `site` removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back. Setting the default to UTF-8 lets implicit
# str <-> unicode conversions (e.g. byte-string .format() with unicode
# arguments) handle non-ASCII text instead of raising UnicodeError.
# NOTE(review): this is a process-wide side effect at import time and does not
# exist on Python 3 -- confirm which interpreter runs this file.
reload(sys)
sys.setdefaultencoding("utf-8")

import os
import json
import random
import itertools
import numpy as np
from scipy import spatial
from datetime import date, datetime
from collections import defaultdict, Counter


def isfloat(value):
    """Return True if ``float(value)`` succeeds, False otherwise.

    Args:
        value: Any object; typically a string taken from parsed input.

    Returns:
        bool: True when ``value`` converts to a float (this includes strings
        such as ``'nan'`` and ``'inf'``), False when the conversion fails.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (None, list, ...).
        # ValueError: string that is not a number.
        # OverflowError: integer too large to be represented as a float.
        return False
    return True


def get_relev(relev_str):
    """Parse a ``key=value;key=value`` string into a dict.

    Values that ``float()`` accepts are stored as floats; every other value is
    kept as the original string. When a key occurs more than once the last
    occurrence wins. Only the text between the first and second ``=`` of a
    segment is used as the value (``'a=b=c'`` yields ``{'a': 'b'}``).

    Segments without ``=`` (empty segments produced by a trailing ``;`` or an
    empty input, or bare tokens) are skipped instead of raising.

    Args:
        relev_str: String of ``;``-separated ``key=value`` pairs.

    Returns:
        dict: Mapping of key to float or str.
    """
    relev = {}
    for part in relev_str.split(';'):
        pieces = part.split('=')
        if len(pieces) < 2:
            # No '=' in this segment: nothing to store.
            continue
        key, value = pieces[:2]
        try:
            value = float(value)
        except ValueError:
            # Not numeric: keep the raw string.
            pass
        relev[key] = value
    return relev


def _relev_score(relev, key):
    """Return ``relev[key]`` if it is numeric, otherwise 0.0."""
    value = relev.get(key)
    if isinstance(value, (int, float)):
        return value
    # Missing or non-numeric values must never pass a ``>=`` threshold.
    return 0.0


def _filter_reason(relev):
    """Return the name of the first filter that rejects ``relev``, or None."""
    if _relev_score(relev, 'cm2') >= 0.5 and _relev_score(relev, 'tvm') >= 0.1:
        return 'commercial'
    if _relev_score(relev, 'pr') >= 0.9:
        return 'porno'
    if relev.get('qmpl', 'ru') != 'ru':
        return 'lang_not_rus'
    return None


def main(*args):
    """Filter queries by duplicate normalization and relev factors.

    Args:
        *args: Exactly six positional values
            ``(queries_list, in2, in3, token, any_param, html_file)``.
            Only ``queries_list`` and ``html_file`` are used.
            ``queries_list`` is a sized iterable of dicts, each with a
            ``'query'`` key and a ``'begemot_data'`` key whose
            ``['answers'][0]`` holds a ``'relev'`` string and, optionally,
            ``['rules']['Doppelgang']['DnormForWildcards']``.
            ``html_file`` is any object with a ``write(str)`` method.

    Returns:
        list: The query dicts that passed every filter, in input order. Each
        returned dict is the input object itself with ``'begemot_data'``
        deleted (the input is mutated).

    Side effects:
        Writes ``'<pre>'``, one line per rejected query and a final summary to
        ``html_file`` (the ``<pre>`` tag is not closed here); prints progress
        to stdout.

    A query is rejected when, in this order:
        * its Doppelgang normalization was already seen
          (``dopp_norm_duplicate``);
        * ``cm2 >= 0.5`` and ``tvm >= 0.1`` (``commercial``);
        * ``pr >= 0.9`` (``porno``);
        * ``qmpl`` is present and not ``'ru'`` (``lang_not_rus``).
    Missing or non-numeric factors are treated as 0.0 for the thresholds.
    """
    queries_list, _in2, _in3, _token, _any_param, html_file = args

    html_file.write('<pre>')
    cnt = Counter()
    total = len(queries_list)

    uniq_dopp_queries = {}
    result = []
    for idx, q in enumerate(queries_list):
        # Merge identical queries by their Doppelgang normalization.
        try:
            rules = q['begemot_data']['answers'][0]['rules']
            dopp_query = rules['Doppelgang']['DnormForWildcards']
        except (KeyError, IndexError, TypeError):
            # No normalization available: the query cannot be deduplicated,
            # but it still goes through the remaining filters.
            print('No Doppelgang for query: "{}"'.format(q['query']))
        else:
            if dopp_query in uniq_dopp_queries:
                cnt['dopp_norm_duplicate'] += 1
                html_file.write('filter "{}", query: "{}" orig: "{}"\n'.format(
                    'dopp_norm_duplicate', q['query'],
                    uniq_dopp_queries[dopp_query]))
                continue
            uniq_dopp_queries[dopp_query] = q['query']

        relev = get_relev(q['begemot_data']['answers'][0]['relev'])

        reason = _filter_reason(relev)
        if reason is not None:
            cnt[reason] += 1
            html_file.write(
                'filter "{}", query: "{}"\n'.format(reason, q['query']))
            continue

        print('{}/{}) query: "{}"'.format(idx, total, q['query']))

        # Drop the bulky annotation payload before returning the query.
        del q['begemot_data']
        result.append(q)

    print('{} done'.format(datetime.now()))
    html_file.write(
        '\n\nfiltered by categories: {}\n'.format(cnt.most_common()))

    return result
