#!/usr/bin/python
# -*- coding: UTF-8 -*-

# Funnel ("voronka") 1: fetch Toloka hits from the morda frontend access log
import os

import nile
import pandas as pd
import yt.wrapper as yt
from nile.api.v1 import (
    clusters,
    Record)

# NOTE(review): an access token is committed in this file. The YT_TOKEN
# environment variable now takes precedence; the literal is kept only as a
# fallback so existing runs keep working. Rotate the token and drop the
# fallback as soon as every runner provides YT_TOKEN.
yt.config['token'] = os.environ.get(
    'YT_TOKEN', 'AVImKQMAAAO3VDAg54V4QjeI8eaNZJSQfA')
yt.config.set_proxy('hahn.yt.yandex.net')


class Morda:
    """Toloka hits taken from the morda frontend access log for one date.

    Constructing an instance fetches the filtered log table (building it
    first when the output table does not exist yet) and loads it into
    ``self.df``; the ``calc_*`` methods then count requests in that frame.
    """

    def __init__(self, date, url_pattern, delay=False):
        """Create the cluster job and load the filtered log into ``self.df``.

        :param date: date string substituted into the source and output
            table paths and into the search patterns.
        :param url_pattern: URL template containing ``id=``; the part after
            ``id=`` is formatted with ``_id``, ``stage``, ``product`` and
            ``date`` (see ``cook_toloka_search_patterns``).
        :param delay: when true, the output table name gets a ``_delay``
            suffix after the date.
        """
        self.username = 'chikachoff'
        self.date = date
        self.url_pattern = url_pattern
        self.cluster = clusters.Hahn(
            pool='search-research_{}'.format(self.username),
            # Reuse the token configured on the yt client at module import
            # instead of keeping a second copy of the credential here.
            token=yt.config['token']
        )
        self.job = self.cluster.job()
        self.delay = delay
        self.df = self.put_results_to_df()

    @staticmethod
    def _filter(records):
        """Yield only the records whose ``request`` field contains
        ``/?source=toloka``; records without the field are skipped."""
        for rec in records:
            if '/?source=toloka' in rec.get('request', ''):
                yield rec

    def fetch_results(self):
        """Return a one-element list with the filtered log table.

        If the output table already exists it is reused; otherwise the
        day's access log is mapped through ``_filter``, written to the
        output table, and the job is run before returning.
        """
        log_tmpl = "//home/atom/chikachoff/morda-frontend-access-log_toloka_{}"
        if self.delay:
            out_table = log_tmpl.format('_'.join([self.date, 'delay']))
        else:
            out_table = log_tmpl.format(self.date)

        if yt.exists(out_table):
            return [self.job.table(out_table)]

        log = self.job.table(
            "//statbox/morda-frontend-access-log/{}".format(self.date))
        result = log.map(self._filter).put(out_table)
        self.job.run()
        return [result]

    def put_results_to_df(self):
        """Read every fetched result into a dataframe, concatenate them,
        store the outcome in ``self.df`` and return it."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Parsing morda logs, preparing morda_df')
        frames = [
            result.read().as_dataframe() for result in self.fetch_results()
        ]
        self.df = pd.concat(frames)
        print('Parsing morda logs, preparing morda_df. Done')
        return self.df

    def cook_toloka_search_patterns(self, toloka_stages, product):
        """Build one search pattern per stage.

        For every task id in ``toloka_stages[stage]['indices']`` the part of
        ``self.url_pattern`` after ``id=`` is filled in (task id zero-padded
        to 6 characters); the per-task strings of a stage are joined with
        ``|``. A stage without task ids yields an empty string.

        :param toloka_stages: container indexable by ``0..len-1`` whose
            items have an ``'indices'`` iterable of task ids.
        :param product: value substituted for ``{product}``.
        :returns: list of pattern strings, one per stage, in stage order.
        :raises IndexError: if ``self.url_pattern`` does not contain ``id=``.
        """
        # Loop-invariant: the template does not depend on stage or task.
        # e.g. '{_id}_{stage}&product={product}&date={date}'
        pattern = self.url_pattern.split('id=')[1]

        toloka_search_patterns = []
        # Index by position (not enumerate) so that a mapping keyed 0..n-1
        # keeps working exactly like a list.
        for stage_id in range(len(toloka_stages)):
            patterns = [
                pattern.format(
                    _id=str(task_id).zfill(6), stage=stage_id,
                    product=product, date=self.date
                )
                for task_id in toloka_stages[stage_id]['indices']
            ]
            toloka_search_patterns.append('|'.join(patterns))
        return toloka_search_patterns

    def calc_morda_metrics_delay(self, product, stage=3):
        """Count distinct task ids seen in the log for ``product`` at ``stage``.

        A request counts when it contains ``product=<product>`` and has an
        ``id=<task>_<stage>`` parameter; the task id is the text between
        ``id=`` and the first underscore.

        :returns: number of distinct task ids found.
        """
        product_marker = 'product={}'.format(product)
        # Require the '_' delimiter so stage 3 does not also match ids that
        # merely end in the same digit (e.g. '..._13').
        stage_suffix = '_{}'.format(stage)

        ids_in_morda = set()
        for cgi_params in self.df.request:
            if product_marker not in cgi_params:
                continue
            for param in cgi_params.split('&'):
                if param.startswith('id=') and param.endswith(stage_suffix):
                    ids_in_morda.add(param.split('=')[1].split('_')[0])
        return len(ids_in_morda)

    def calc_morda_metrics(self, toloka_stages, product):
        """Count distinct matching requests for every stage.

        :param toloka_stages: passed to ``cook_toloka_search_patterns``.
        :param product: passed to ``cook_toloka_search_patterns``.
        :returns: list with one count per stage (same order as the stages):
            the number of distinct ``request`` values in ``self.df`` that
            match the stage's search pattern.
        """
        print('\nCalculating morda log metrics. Product: {}'.format(product))
        print('len(morda_df): {}'.format(len(self.df)))
        morda_metrics = []
        search_patterns = self.cook_toloka_search_patterns(
            toloka_stages=toloka_stages, product=product
        )
        for search_pattern in search_patterns:
            if not search_pattern:
                # An empty pattern would match every row; a stage with no
                # task ids has no hits by definition.
                morda_metrics.append(0)
                continue
            try:
                requests = self.df.request
                # na=False: a missing request is simply "no match".
                matched = requests[
                    requests.str.contains(search_pattern, na=False)
                ]
                metric = matched.nunique()
            except IndexError:
                # Best-effort as before, but record 0 instead of skipping so
                # the result stays aligned with the stage indices.
                metric = 0
            morda_metrics.append(metric)
        print('Morda metrics OK')
        return morda_metrics
