# -*- coding: utf-8 -*-
import argparse
import datetime
import json
import random
import sys
import re
import string
import urllib2
import time
import codecs
import os
from collections import defaultdict

def rtmr_get_records(configs, params):
    """Fetch query records from the RTMR HTTP endpoint for every config.

    One GET request is issued per entry of ``configs``.  The whole pass over
    ``configs`` is repeated ``params['n_extra_tries']`` additional times
    (default 0), sleeping ``params['timeout']`` seconds (default 1800) before
    each repeat.  Records from every pass are accumulated in the result.

    Args:
        configs: iterable of dicts with the keys ``device``, ``domain``,
            ``intent``, ``search``, ``maxrecords``, ``shuffle`` and
            ``region_id``.
        params: dict with the keys ``service``, ``table``, ``maxtimeback``
            and ``search_type``; ``n_extra_tries`` and ``timeout`` are
            optional.

    Returns:
        A list of dicts with the keys ``query_text``, ``query_region_id``,
        ``query_date``, ``type``, ``search_type``, ``query_country`` and
        ``query_device`` -- one per non-empty record value.

    Raises:
        Whatever ``urllib2.urlopen`` / ``json.loads`` raise on a failed
        request or a malformed body; nothing is caught here.
    """
    GET_RTMR_RECORDS = "{0}/yandsearch?view=hr&table={1}&key={2}&maxrecords={3}&maxtimeback={4}&client={5}"

    result = []

    for it in range(1 + params.get('n_extra_tries', 0)):
        if it > 0:
            print >>sys.stderr, "Sleep before next retry"
            time.sleep(params.get('timeout', 1800))
        for config in configs:
            key = {'device': config['device'],
                   'domain': config['domain'],
                   'intent': config['intent'],
                   'search': config['search']}
            # The key is sent as compact JSON (no spaces), URL-quoted.
            key = urllib2.quote(json.dumps(key).replace(' ', ''))
            url = GET_RTMR_RECORDS.format(params['service'],
                                          params['table'],
                                          key,
                                          config['maxrecords'],
                                          params['maxtimeback'],
                                          params['search_type'])

            print >>sys.stderr, "Open url '{0}'".format(url)
            request = urllib2.Request(url, None, {})
            connection = urllib2.urlopen(request)
            try:
                response = json.loads(connection.read())
            finally:
                # Release the socket even when the body is not valid JSON.
                connection.close()

            if response is None:
                continue

            # A response without 'Entries' is treated as empty, which is what
            # the log line below has always reported for it.
            records = response.get('Entries', [])
            print >>sys.stderr, "'{0} / {1}' returns {2} records".format(
                config['intent'], config['search'], len(records))
            if config['shuffle'] == 'yes':
                random.shuffle(records)

            # Identical for every record of this config, so computed once.
            date = datetime.datetime.now().strftime("%Y-%m-%d")
            config_type = '/'.join([config['search'], config['intent'], config['device']])

            for record in records:
                # Drop backslashes and double quotes, then squeeze runs of
                # spaces down to a single space.
                query = record['Value'].replace('\\', '').replace('"', '')
                while '  ' in query:
                    query = query.replace('  ', ' ')
                if query == "":
                    continue

                result.append({
                    'query_text': query,
                    'query_region_id': int(config['region_id']),
                    'query_date': date,
                    'type': config_type,
                    'search_type': params['search_type'],
                    'query_country': config['domain'].upper(),
                    'query_device': config['device']
                })

    return result


def sample_queries_with_hour_and_country_pdf(data, config, hour_pdf):
    out_data = []
    data_by_hour = defaultdict(list)
    hour_share = {x['hour']: x['share'] for x in hour_pdf}

    for elem in data:
        hour = elem['serp_date'].split('T')[1].split(':')[0]

        if int(hour) < 0 or int(hour) > 23:
            print >>sys.stderr, "Unknown hour", hour, elem['serp_date']
            sys.exit(1)

        if hour not in hour_share:
            print >>sys.stderr, "Hour not found in hour PDF config", hour
            sys.exit(1)

        data_by_hour[hour].append(elem)

    for hour in sorted(data_by_hour.keys()):
        print "Running sampling for hour", hour

        needed_elems = int(config['limit'] * hour_share[hour] + 0.999)
        print "Requested", needed_elems, "present", len(data_by_hour[hour])

        new_queries = sample_queries_with_distribution(data_by_hour[hour], dict(config, limit=needed_elems), exclude_list=out_data)

        print "Got", len(new_queries)
        out_data.extend(new_queries)

    random.shuffle(out_data)
    return out_data[:config['limit']]


def get_queries_from_serps(filenames, append_query_date=False):
    DEVICE_MAP = {0: 'DESKTOP', 1: 'ANDROID', 2: 'IPHONE', 3: 'UNKNOWN', 4: 'WINDOWS_PHONE'}

    out_data = []

    for fname in filenames:
        if not re.match(r'^[0-9]+\.json$', fname):
            continue
        print "Parsing", fname
        serp_set_id = int(fname.split('.')[0])

        with codecs.open(fname, 'r', 'utf8') as f:
            data = json.load(f, encoding='utf8')

        for serp in data:
            if 'query' not in serp:
                continue

            query = serp['query']
            out_elem = {'query_text': query['text'],
                        'query_region_id': query['regionId'],
                        'query_country': query['country'],
                        'query_device': DEVICE_MAP[query['device']],
                        'serp_set_id': serp_set_id
                        }
            if append_query_date:
                out_elem['query_date'] = serp['serp_query_param.query_date']

            out_data.append(out_elem)

    return out_data


def sample_queries_with_distribution(data, config, exclude_list=None):
    """Randomly sample queries following a per-country percentage table.

    Elements whose ``query_country`` is not a key of
    ``config['countries_distribution']``, or whose ``query_text`` occurs in
    ``exclude_list``, are dropped first.  For every remaining country,
    ``percent / 100 * limit`` elements (rounded to nearest) are drawn without
    replacement, or all of them when fewer are available.

    Args:
        data: iterable of dicts with ``query_country`` and ``query_text``.
        config: dict with ``countries_distribution`` (country -> percent) and
            ``limit`` (maximum size of the result).
        exclude_list: optional iterable of dicts whose ``query_text`` values
            must not be sampled.  ``None`` (the default) excludes nothing.

    Returns:
        A shuffled list of at most ``config['limit']`` elements of ``data``.
    """
    # A ``None`` default replaces the former shared mutable ``[]`` default.
    excluded_queries = {x['query_text'] for x in exclude_list or ()}
    limit = config['limit']

    out_data = []
    total_in = 0
    data_by_country = defaultdict(list)

    for elem in data:
        if elem['query_country'] in config['countries_distribution'] and elem['query_text'] not in excluded_queries:
            data_by_country[elem['query_country']].append(elem)
            total_in += 1

    # write() emits the same bytes as the former print statements while
    # keeping this function valid under both Python 2 and Python 3.
    sys.stdout.write("Elements after exclusion {}\n".format(total_in))

    for country in data_by_country:
        candidates = data_by_country[country]
        # "+ 0.5" before truncation rounds the quota to the nearest integer.
        needed_elems = int(config['countries_distribution'][country] / 100.0 * limit + 0.5)
        total_elems = len(candidates)
        n_sampled = min(needed_elems, total_elems)
        out_data.extend(random.sample(candidates, n_sampled))
        sys.stderr.write("Country {}: total {}, sampled {}\n".format(country, total_elems, n_sampled))

    # Per-country rounding can overshoot the limit, hence the final cut.
    random.shuffle(out_data)
    return out_data[:limit]


def sample_queries_with_distribution_fresh_priority(data, config):
    """Sample queries per country, preferring ``IMAGE_FRESH`` ones.

    For every country listed in ``config['countries_distribution']`` the
    quota is ``percent / 100 * limit`` (rounded to nearest).  The quota is
    filled with ``IMAGE_FRESH`` elements first and then, unless
    ``config['suppress_fresh_possible']`` is set, with ``IMAGE_POSSIBLE``
    ones.  Elements with any other ``image_queryfresh`` value are never
    selected.

    The selected elements are then flagged IN PLACE (the dicts of ``data``
    themselves are modified): independent random subsets of roughly one
    eighth, one quarter and one half of them receive
    ``sample_fresh_eighth`` / ``sample_fresh_quarter`` /
    ``sample_fresh_half`` set to ``True``.

    Args:
        data: iterable of dicts with ``query_country`` and
            ``image_queryfresh``.
        config: dict with ``countries_distribution`` and ``limit``; optional
            flags ``suppress_fresh_possible``, ``expand_with_fresh`` (top up
            an under-filled result with leftover fresh elements) and
            ``output_everything``.

    Returns:
        The list of selected elements (at most ``limit``).  With
        ``output_everything`` set, a list of copies instead: the selected
        elements with ``take=True`` followed by the tracked unused ones with
        ``take=False``.
    """
    limit = config['limit']
    use_possible = not config.get('suppress_fresh_possible')

    out_data = []
    unused_fresh = []
    unused_possible = []
    data_by_country = defaultdict(list)

    for elem in data:
        if elem['query_country'] in config['countries_distribution']:
            data_by_country[elem['query_country']].append(elem)

    for country in data_by_country:
        country_data = data_by_country[country]
        needed_elems = int(config['countries_distribution'][country] / 100.0 * limit + 0.5)

        only_fresh_data = [x for x in country_data if x['image_queryfresh'] == 'IMAGE_FRESH']
        possible_fresh_data = [x for x in country_data if x['image_queryfresh'] == 'IMAGE_POSSIBLE']

        n_fresh_sampled = min(needed_elems, len(only_fresh_data))
        # Nothing is taken from the "possible" pool when it is suppressed, so
        # the count (and therefore the log line below) must say zero.
        n_possible_sampled = 0
        if use_possible:
            n_possible_sampled = min(needed_elems - n_fresh_sampled, len(possible_fresh_data))

        if n_fresh_sampled > 0:
            random.shuffle(only_fresh_data)
            out_data.extend(only_fresh_data[:n_fresh_sampled])
            unused_fresh.extend(only_fresh_data[n_fresh_sampled:])
        # NOTE(review): when the quota is zero, this country's fresh elements
        # are not added to unused_fresh, unlike the "possible" ones below.
        # Kept as is -- confirm whether that is intended.

        if n_possible_sampled > 0:
            random.shuffle(possible_fresh_data)
            out_data.extend(possible_fresh_data[:n_possible_sampled])
            unused_possible.extend(possible_fresh_data[n_possible_sampled:])
        else:
            unused_possible.extend(possible_fresh_data)

        # Report what was actually taken rather than min(quota, total), which
        # over-counted whenever elements had another freshness status.
        sys.stderr.write("Country {}: total {}, sampled {} (fresh {}, possible {})\n".format(
            country, len(country_data), n_fresh_sampled + n_possible_sampled,
            n_fresh_sampled, n_possible_sampled))

    random.shuffle(out_data)
    data_limited = out_data[:limit]
    unused_data = out_data[limit:]

    if config.get('expand_with_fresh') and len(data_limited) < limit:
        random.shuffle(unused_fresh)
        data_extended = data_limited + unused_fresh
        data_limited = data_extended[:limit]
        random.shuffle(data_limited)
        unused_data.extend(data_extended[limit:])
    # NOTE(review): outside of the branch above, leftover fresh elements never
    # reach unused_data, so 'output_everything' omits them.  Kept as is --
    # confirm whether that is intended.

    unused_data.extend(unused_possible)

    # Explicit floor division: the counts must be ints for random.sample()
    # regardless of the interpreter's "/" semantics.  Eighth and quarter
    # round up, half rounds down; none can exceed n_elems.
    n_elems = len(data_limited)
    flag_sizes = (
        ('sample_fresh_eighth', (n_elems + 7) // 8),
        ('sample_fresh_quarter', (n_elems + 3) // 4),
        ('sample_fresh_half', n_elems // 2),
    )
    for flag, n_flagged in flag_sizes:
        # Each flag gets its own independent random subset.
        for elem in random.sample(data_limited, n_flagged):
            elem[flag] = True

    if config.get('output_everything'):
        return ([dict(elem, take=True) for elem in data_limited] +
                [dict(elem, take=False) for elem in unused_data])
    return data_limited


def unique_text_by_country(data):
    seen = set()
    out_data = []

    for elem in data:
        query_text = elem['query_text'].lower().encode('utf8')
        query_text = query_text.translate(None, string.punctuation)
        query_text = query_text.strip()
        query_text = re.sub(r'\s\s+', ' ', query_text)
        key = (query_text, elem['query_country'].lower())
        if key not in seen:
            seen.add(key)
            out_data.append(elem)
        else:
            print >>sys.stderr, "Duplicate", elem['query_country'], elem['query_text']

    return out_data


def search_link_a(query):
    """Return an HTML ``<a>`` element linking to the image search for *query*.

    The query is percent-encoded (as UTF-8) inside the ``href`` and
    HTML-escaped in the visible link text, so characters such as ``&``,
    ``#``, ``"`` or ``<`` can neither break the URL nor inject markup into
    the surrounding document.

    Args:
        query: the query text (a text string).

    Returns:
        A unicode string of the form
        ``<a href="https://yandex.ru/images/search?text=...">...</a>``.
    """
    # Function-scope imports keep this fix self-contained; the fallback makes
    # the function work under both Python 2 and Python 3.
    from xml.sax.saxutils import escape
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # quote() needs bytes for non-ASCII text under Python 2; safe='' also
    # encodes '/', which has no special role inside a query parameter.
    href_query = quote(query.encode('utf-8'), safe='')
    # escape() covers &, < and >; the double quote is added explicitly.
    link_text = escape(query, {u'"': u'&quot;'})
    return u'<a href="https://yandex.ru/images/search?text={href}">{text}</a>'.format(
        href=href_query, text=link_text)


def make_email_body(data, params, statuses):
    """Build the HTML report describing the sampled queries.

    The report contains the total number of queries, a table with the number
    of queries per country, and, for every non-empty country, one table per
    status listing its queries sorted case-insensitively by text.  Countries
    other than RU, BY, KZ and UA (or a missing ``query_country``) are
    grouped under ``OTHER``.

    Args:
        data: sequence of dicts.  ``query_country`` is optional; elements that
            reach a status table must also carry ``image_queryfresh``,
            ``query_text``, ``query_device`` and ``query_source``.
        params: dict; only ``date`` is read.
        statuses: sequence of ``image_queryfresh`` values to report, in
            display order.

    Returns:
        A one-element list ``[{'body': <html>, 'date': params['date']}]``.
    """
    other_country = 'OTHER'
    countries = ['RU', 'BY', 'KZ', 'UA', other_country]

    data_by_country = dict((country, []) for country in countries)
    for elem in data:
        elem_country = elem.get('query_country', other_country)
        if elem_country not in data_by_country:
            elem_country = other_country
        data_by_country[elem_country].append(elem)

    # Countries without queries are left out of both report sections.
    reported = [country for country in countries if data_by_country[country]]

    # Fragments are collected in a list and joined once at the end instead of
    # growing one string with repeated "+=".
    parts = [
        u'<html>',
        u'<body>',
        u'Total queries count: {}'.format(len(data)),
        u'<br/><br/>Countries stats:',
        u'<table border="1">',
    ]
    for country in reported:
        parts.append(u'<tr><td>{}</td><td>{}</td></tr>'.format(country, len(data_by_country[country])))
    parts.append(u'</table>')

    for country in reported:
        parts.append(u'<br/><br/><b>{}</b>'.format(country))
        for status in statuses:
            cur_elems = sorted(
                (x for x in data_by_country[country] if x['image_queryfresh'] == status),
                key=lambda x: x['query_text'].lower())
            if not cur_elems:
                continue

            parts.append(u'<br/>{}<br/>'.format(status))
            parts.append(u'<table>')
            for elem in cur_elems:
                # The first cell is now closed with </td>; it used to be
                # followed by a stray "<td/>", yielding malformed rows.
                parts.append(u'<tr><td>{}</td><td>{}</td><td>{}</td></tr>'.format(
                    search_link_a(elem['query_text']), elem['query_device'], elem['query_source']))
            parts.append(u'</table>')

    parts.append(u'<br/><br/><a href="https://staff.yandex-team.ru/scrapbrain">@scrapbrain</a>')
    parts.append(u'</body>')
    parts.append(u'</html>')

    return [{'body': u''.join(parts), 'date': params['date']}]
