import argparse
import os

import yt.wrapper as yt
import sys


# CGI suffix appended verbatim to every request emitted by RequestFilter.
ADD_CGI = '&pron=ignore_db_timestamp&pron=nosmfa&allfctrs=da'
# Names of `pron=` parameters stripped from requests before they are emitted
# (matched as a prefix of the `pron=<name>` request part).
REMOVE_PRONS = ('newest_runtime_docids', 'smautodegrade',)


class EventFilter(object):
    """Map stage: pass through only the rows whose event type is of interest.

    Rows with any other ``event_type`` (or without one) are dropped.
    """

    # Event types that are kept; everything else is discarded.
    _KEPT_EVENT_TYPES = ('EnqueueYSRequest', 'SubSourceRequest')

    def __call__(self, row):
        """Yield ``row`` unchanged when its ``event_type`` is one of the kept types."""
        event_type = row.get('event_type')
        if event_type not in self._KEPT_EVENT_TYPES:
            return
        yield row


class RequestFilter(object):
    """Reduce stage: collect sub-source requests of groups that hit one collection.

    For each group of rows it gathers the requests found in
    ``SubSourceRequest`` events that pass the enabled request-type filters,
    and emits them only when the same group also contains an
    ``EnqueueYSRequest`` event that references the configured collection.
    """

    def __init__(self, search, factors, snippets, collection):
        """Store the filter configuration.

        :param search: when true, keep requests that have no ``&dh=`` parameter.
        :param factors: when true, keep requests containing ``&allfctrs=da``.
        :param snippets: when true, keep requests containing ``&DF=da``.
        :param collection: collection name (without the leading slash) that an
            ``EnqueueYSRequest`` event must reference for the group to be emitted.
        """
        self._search = search
        self._factors = factors
        self._snippets = snippets
        self._collection = collection

    def _filter_search(self, request):
        """Return ``request`` if search filtering is on and it has no ``&dh=``; else ``None``."""
        if self._search and '&dh=' not in request:
            return request

    def _filter_factors(self, request):
        """Return ``request`` if factors filtering is on and it has ``&allfctrs=da``; else ``None``."""
        if self._factors and '&allfctrs=da' in request:
            return request

    def _filter_snippets(self, request):
        """Return ``request`` if snippets filtering is on and it has ``&DF=da``; else ``None``."""
        if self._snippets and '&DF=da' in request:
            return request

    def _filter_request(self, request):
        """Return ``request`` if at least one enabled filter accepts it, else ``None``.

        An empty request is never accepted.
        """
        checks = (self._filter_search, self._filter_factors, self._filter_snippets)
        for check in checks:
            # Each check returns the request itself on success, so a truthy
            # result means "accepted by this filter".
            if check(request):
                return request
        return None

    def _remove_pron(self, request, pron):
        """Return ``request`` without the ``&``-separated parts starting with ``pron=<pron>``.

        Empty parts (e.g. produced by ``&&``) are dropped as well.
        """
        unwanted_prefix = 'pron={}'.format(pron)
        return '&'.join(
            part
            for part in request.split('&')
            if part and not part.startswith(unwanted_prefix)
        )

    def __call__(self, key, rows):
        """Yield ``{'request': ...}`` rows for one group of events.

        :param key: reduce key of the group (not used).
        :param rows: iterable of rows carrying ``event_type`` and ``event_data``.

        Nothing is yielded unless some ``EnqueueYSRequest`` row of the group
        references the configured collection. Events whose ``event_data`` has
        too few whitespace-separated fields are skipped.
        """
        coll = None
        requests = []

        for row in rows:
            event_type = row.get('event_type')

            if event_type == 'EnqueueYSRequest':
                fields = row['event_data'].split()
                if len(fields) > 3:
                    # The fourth field has the form /yandsearch or /imagesultra, e.g.
                    request_collection = fields[3][1:]
                    if request_collection == self._collection:
                        coll = request_collection

            elif event_type == 'SubSourceRequest':
                fields = row['event_data'].split()
                if len(fields) <= 4:
                    # Truncated event: there is no request field to extract.
                    # Skip it instead of failing the whole reduce job, the same
                    # way short EnqueueYSRequest events are tolerated above.
                    continue

                request = self._filter_request(fields[4])
                if request:
                    for pron in REMOVE_PRONS:
                        request = self._remove_pron(request, pron)
                    requests.append({'request': request.replace('http2://', 'http://') + ADD_CGI})

        if coll:
            for request in requests:
                yield request


def parse_args():
    """Parse the command line and return the resulting namespace.

    Four required options describe the job (YT proxy, source table, collection,
    output table); at most one of ``--search``, ``--factors``, ``--snippets``
    may be given to restrict the selected request type.
    """
    required_options = (
        ('--yt-proxy', 'YT proxy'),
        ('--source-table', 'Source table'),
        ('--collection', 'Collection'),
        ('--output-table', 'Output table'),
    )
    request_type_flags = (
        ('--search', 'Select only SEARCH queries'),
        ('--factors', 'Select only FACTORS queries'),
        ('--snippets', 'Select only SNIPPETS queries'),
    )

    parser = argparse.ArgumentParser()
    for flag, description in required_options:
        parser.add_argument(flag, help=description, required=True)

    # The request-type switches cannot be combined with each other.
    exclusive_group = parser.add_mutually_exclusive_group()
    for flag, description in request_type_flags:
        exclusive_group.add_argument(flag, help=description, action='store_true', default=False)

    return parser.parse_args()


def main():
    """Configure the YT client from the command line and run the map-reduce job.

    When none of the request-type switches is given, all three request types
    are selected.
    """
    args = parse_args()

    # No explicit request type on the command line means "take everything".
    select_all = not (args.search or args.factors or args.snippets)
    filter_func = RequestFilter(
        search=select_all or args.search,
        factors=select_all or args.factors,
        snippets=select_all or args.snippets,
        collection=args.collection,
    )

    yt.config['proxy']['url'] = args.yt_proxy
    yt.config['pickling']['python_binary'] = '/skynet/python/bin/python'
    # The token is taken from the environment; it is None when YT_TOKEN is unset.
    yt.config['token'] = os.environ.get('YT_TOKEN')

    yt.run_map_reduce(
        EventFilter(),
        filter_func,
        args.source_table,
        args.output_table,
        reduce_by=['frame_id'],
    )


# Run the job only when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
