from __future__ import division

from collections import defaultdict
import sys
import re
import datetime
import requests
import urllib
import urllib2
import urlparse
import json
import time

__author__ = 'chikachoff'

import yt.wrapper as yt


def module_filter(module):
    """Predicate over modules, installed below as yt's pickling module filter.

    Returns False for the module named exactly 'uatraits' and for any module
    whose name starts with 'statbox'; returns True for everything else,
    including falsy values such as None and objects without a ``__name__``.
    """
    if module:
        module_name = getattr(module, '__name__', '')
        if module_name == 'uatraits':
            return False
        if module_name.startswith('statbox'):
            return False
    return True


# Global yt.wrapper client configuration. These statements run at import time,
# so they take effect before main() launches any operation.

# Set the output auto-merge action to "merge".
# NOTE(review): presumably merges small output chunks after operations -
# confirm against the yt.wrapper configuration docs.
yt.config["auto_merge_output"]["action"] = "merge"
# Register module_filter as the pickling module filter; it rejects 'uatraits'
# and any module whose name starts with 'statbox'.
yt.config["pickling"]["module_filter"] = module_filter
# Proxy host the client talks to.
yt.config.set_proxy('plato.yt.yandex.net')
# NOTE(review): presumably enables recursive creation of missing parent
# nodes - confirm for the installed yt.wrapper version.
yt.config.CREATE_RECURSIVE = True
# NOTE(review): presumably makes missing input tables read as empty instead
# of raising - confirm for the installed yt.wrapper version.
yt.config.TREAT_UNEXISTING_AS_EMPTY = True


#class MapQueries():
#
#    def __call__(self, rec):
#        if rec['query'] == query:
#            yield {'query': rec['query'],
#                   'tags': rec['tags']}

class Joiner():
    """Reducer that keeps only keys present in both input tables.

    Each record is expected to carry a 'tableindex' field: the string '0'
    marks a record from the first input and '1' a record from the second.
    Records with any other value are ignored.
    NOTE(review): how 'tableindex' gets populated is not visible here -
    verify against the input tables / operation format.
    """

    def __call__(self, key, recs):
        """Yield one joined row for ``key`` if both inputs contributed records.

        :param key: mapping holding the reduce key; its 'query' value is
            copied into the output row.
        :param recs: iterable of records sharing that key. It is always
            consumed in full.
        :return: generator producing at most one dict with 'query' and
            'tags'. 'tags' is ``str()`` of the 'tags' value taken from the
            last second-table record seen.
        """
        seen_left = False
        seen_right = False
        last_tags = {}

        for row in recs:
            origin = row['tableindex']
            if origin == '0':
                seen_left = True
            elif origin == '1':
                seen_right = True
                # Later second-table records overwrite earlier ones.
                last_tags = row['tags']

        # Inner join: emit nothing unless both sides were seen.
        if not (seen_left and seen_right):
            return

        yield {'query': key['query'], 'tags': str(last_tags)}

def main():
    """Join the queries table with the query-tags table, then sort the result.

    Runs a reduce with Joiner over both inputs keyed by 'query', writing to
    the 'joined' table, and afterwards sorts that table in place by 'query'.
    """
    # NOTE: earlier per-query map pipeline, left commented out for reference.
    #top_queries_table = yt.smart_upload_file('topQ100K.tsv', placement_strategy='ignore')
    #source_table= '//home/search-research/alexkuk/research/research-952/map_markup'
    #result_table = '//home/tr-analysts/chikachoff/topqueries100K_clusters'
    #for query in queries:
    #    yt.run_map(
    #        MapQueries(),
    #        source_table=source_table,
    #        destination_table=yt.TablePath(result_table, append=True),
    #        format=yt.DsvFormat()
    #        )

    #yt.run_sort(
    #    source_table=result_table,
    #    destination_table=result_table,
    #    sort_by=['query']
    #    )
    join_column = 'query'
    joined_table = '//home/tr-analysts/chikachoff/joined'

    # Input order matters: Joiner tells the two sources apart per record.
    yt.run_reduce(
        Joiner(),
        source_table=[
            '//home/tr-analysts/chikachoff/queries',
            '//home/tr-analysts/chikachoff/queries_tags',
        ],
        destination_table=joined_table,
        reduce_by=[join_column],
    )

    # Sort the joined table in place (source and destination are the same).
    yt.run_sort(
        source_table=joined_table,
        destination_table=joined_table,
        sort_by=[join_column],
    )

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
