from __future__ import division

from collections import defaultdict
import sys
import re
import datetime
import requests
import urllib
import urllib2
import urlparse
import json
import time

__author__ = 'chikachoff'

import yt.wrapper as yt


def module_filter(module):
    """Tell whether ``module`` passes the filter installed for yt pickling.

    Falsy values always pass.  Otherwise the decision is made on the
    object's ``__name__`` (an empty string when it has none): the
    ``uatraits`` module and every module whose name begins with
    ``statbox`` are rejected, everything else is accepted.

    Returns ``True`` when the module is accepted, ``False`` otherwise.
    """
    if module:
        module_name = getattr(module, '__name__', '')
        if module_name == 'uatraits':
            return False
        if module_name.startswith('statbox'):
            return False
    return True

# Global yt.wrapper configuration.  These statements run at import time and
# affect every operation started from this script.

# Set the "action" of the "auto_merge_output" option to "merge".
# NOTE(review): presumably makes yt merge operation output chunks
# automatically -- confirm against the yt.wrapper documentation.
yt.config["auto_merge_output"]["action"] = "merge"
# Install module_filter (defined above) as the pickling module filter, so
# 'uatraits' and 'statbox*' modules are rejected by it.
yt.config["pickling"]["module_filter"] = module_filter
# Cluster proxy all requests are sent to.
yt.config.set_proxy('plato.yt.yandex.net')
# NOTE(review): the two flags below look like "create missing parent nodes"
# and "treat missing input tables as empty" -- verify their exact semantics
# in the yt.wrapper version in use.
yt.config.CREATE_RECURSIVE = True
yt.config.TREAT_UNEXISTING_AS_EMPTY = True


class WatchLogMapper:
    """Mapper that keeps watch-log records referred from Google or Yandex.

    A record is classified by the prefix of its ``referer`` field.  For each
    record whose referer matches one of the known prefixes and whose
    ``yandexuid`` is non-empty, one output record is produced with the
    fields ``yandexuid``, ``search`` (``'google'`` or ``'yandex'``) and
    ``referer``.  All other records are dropped.
    """

    # Referer prefixes identifying each search engine.  Kept on the class so
    # the mapper stays self-contained.
    _GOOGLE_PREFIXES = (
        'http://www.google.ru/',
        'https://www.google.ru/',
    )
    _YANDEX_PREFIXES = (
        'https://yandex.ru/',
        'http://yandex.ru/',
        'http://www.yandex.ru/',
        'https://www.yandex.ru/',
    )

    def __init__(self):
        pass

    def __call__(self, rec):
        """Yield at most one classified record for the input record ``rec``.

        ``rec`` is a mapping of field names to string values.  Only
        ``referer`` and ``yandexuid`` are read; missing or empty values are
        treated alike and cause the record to be skipped, so a row lacking
        an unused column (such as ``region_id``) no longer aborts the job.
        """
        referer = rec.get('referer') or ''
        yandexuid = rec.get('yandexuid') or ''

        # str.startswith accepts a tuple of alternatives.
        if referer.startswith(self._GOOGLE_PREFIXES):
            search = 'google'
        elif referer.startswith(self._YANDEX_PREFIXES):
            search = 'yandex'
        else:
            # Not a recognised search referer (this also covers an empty one).
            return

        if yandexuid:
            yield {
                'yandexuid': yandexuid,
                'search': search,
                'referer': referer,
            }


class UniqueReducer():
    """Reducer that counts Google and Yandex search hits within one key group."""

    def __call__(self, key, recs):
        """Yield one summary record for the group identified by ``key``.

        Each record in ``recs`` contributes one hit to the engine named in
        its ``search`` field.  The output carries ``yandexuid`` (taken from
        ``key``), the two counts and the Google share in percent, all
        rendered as strings.
        """
        tally = {}
        for row in recs:
            engine = row['search']
            tally[engine] = tally.get(engine, 0) + 1

        n_google = tally.get('google', 0)
        n_yandex = tally.get('yandex', 0)

        # The two degenerate cases deliberately stay integers so that they
        # are rendered as '0' and '100' rather than as floats.
        if not n_google:
            share = 0
        elif not n_yandex:
            share = 1
        else:
            share = n_google / (n_google + n_yandex)

        summary = {'yandexuid': key['yandexuid']}
        summary['yandex_count'] = str(n_yandex)
        summary['google_count'] = str(n_google)
        summary['google_share'] = str(share * 100)
        yield summary


class FinalReducer():
    """Reducer that sums the per-group search counts of one key."""

    def __call__(self, key, recs):
        """Yield one record with the summed counts for ``key``.

        Every record in ``recs`` must carry ``google_count`` and
        ``yandex_count`` as integer strings.  The output holds ``yandexuid``
        (taken from ``key``), the two totals and the Google share in
        percent, all rendered as strings.

        When both totals are zero the share is reported as ``0.0`` instead
        of raising ``ZeroDivisionError``.
        """
        google_count = 0
        yandex_count = 0
        for rec in recs:
            google_count += int(rec['google_count'])
            yandex_count += int(rec['yandex_count'])

        total = google_count + yandex_count
        # True division: this file starts with
        # ``from __future__ import division``.
        google_share = google_count / total if total else 0.0

        yield {
            'yandexuid': key['yandexuid'],
            'yandex_count': str(yandex_count),
            'google_count': str(google_count),
            'google_share': str(google_share * 100),
        }


class FinalMap():
    """Mapper that selects users who searched mostly, and repeatedly, on Google."""

    def __init__(self):
        # Stateless: nothing to initialise.
        pass

    def __call__(self, rec):
        """Yield ``{'yandexuid': ...}`` when ``rec`` passes both thresholds.

        A record passes when its ``google_count`` is greater than 3 and its
        ``google_share`` is greater than 50.  The share is only parsed once
        the count check has succeeded.
        """
        if int(rec['google_count']) <= 3:
            return
        if float(rec['google_share']) <= 50:
            return
        yield {'yandexuid': rec['yandexuid']}


def main():
    """Run the whole pipeline on YT for 12-18 October 2015.

    For each day the watch log is mapped with ``WatchLogMapper``, sorted by
    ``yandexuid`` and reduced with ``UniqueReducer``; the per-day results
    are appended to one table.  That table is then sorted and reduced with
    ``FinalReducer``, and ``FinalMap`` writes the selected uids to the final
    table.  Intermediate tables are erased as soon as they have been
    consumed.
    """
    log_prefix = '//statbox/watch-log/2015-10-'
    mapped = '//home/tr-analysts/chikachoff/searches-2015-10-08-map'
    per_day = '//home/tr-analysts/chikachoff/searches-2015-10-08-reduce'
    totals = '//home/tr-analysts/chikachoff/searches-2015-10-finalreduce'
    selected_uids = '//home/tr-analysts/chikachoff/searches-2015-10-uids-searchesgt3'

    for day_of_month in ['12', '13', '14', '15', '16', '17', '18']:
        # Stage 1: classify the referers of one day's log.
        yt.run_map(
            WatchLogMapper(),
            source_table=log_prefix + day_of_month,
            destination_table=yt.TablePath(mapped),
            format=yt.DsvFormat())
        # Stage 2: sort in place so the table can be reduced by yandexuid.
        yt.run_sort(
            source_table=mapped,
            destination_table=mapped,
            sort_by=['yandexuid'])
        # Stage 3: per-user counts for the day, appended to the shared table.
        yt.run_reduce(
            UniqueReducer(),
            source_table=mapped,
            destination_table=yt.TablePath(per_day, append=True),
            reduce_by=['yandexuid'],
            format=yt.DsvFormat())
        yt.run_erase(mapped)

    # Combine the per-day counts into one record per user.
    yt.run_sort(
        source_table=per_day,
        destination_table=per_day,
        sort_by=['yandexuid'])
    yt.run_reduce(
        FinalReducer(),
        source_table=per_day,
        destination_table=totals,
        reduce_by=['yandexuid'],
        format=yt.DsvFormat())
    yt.run_erase(per_day)

    # Keep only the users that pass the FinalMap thresholds.
    yt.run_map(
        FinalMap(),
        source_table=totals,
        destination_table=selected_uids,
        format=yt.DsvFormat())
    yt.run_erase(totals)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
