import collections
import datetime
import logging
import os

import yt.wrapper as yt

import cars.settings
from cars.aggregator.static_data.operators import OPERATORS
from cars.core.util import ensure_yt_dir, make_yt_client
from cars.feeds import Feed


# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)


class PreprocessorMapper(object):
    """Mapper that converts raw feed log rows into per-city car counts.

    Rows whose ``type`` is not ``'feed'``, rows without an ``operator``
    value, and rows whose operator is not present in ``OPERATORS`` produce
    no output (the last case is also logged as a warning).
    """

    def __call__(self, row):
        """Yield one record per city found in the feed carried by *row*.

        Args:
            row: Mapping for a single log record. The keys read are
                ``type``, ``operator``, ``unixtime`` and ``data`` (whose
                ``feed`` entry is handed to ``Feed.parse_feed``).

        Yields:
            dict with keys ``city_id``, ``unixtime``, ``date`` (ISO date of
            ``unixtime`` interpreted as UTC), ``operator`` (the operator's
            ``short_name``) and ``n_cars`` (number of parsed cars whose
            ``city_id`` matches).

        Raises:
            KeyError: if a feed row of a known operator lacks ``unixtime``
                or ``data``/``feed``.
        """
        if row.get('type') != 'feed':
            return

        operator_short_name = row.get('operator')
        if not operator_short_name:
            return
        operator = OPERATORS.get(operator_short_name)
        if not operator:
            LOGGER.warning('Unknown operator: %s', operator_short_name)
            return

        unixtime = row['unixtime']
        date_str = datetime.datetime.utcfromtimestamp(unixtime).date().isoformat()

        parsed_feed = Feed.parse_feed(operator=operator, feed=row['data']['feed'])

        # Only the number of cars per city is needed, so tally the city ids
        # directly instead of collecting the car objects into per-city lists.
        cars_per_city = collections.Counter(car.city_id for car in parsed_feed)

        for city_id, n_cars in cars_per_city.items():
            yield {
                'city_id': city_id,
                'unixtime': unixtime,
                'date': date_str,
                'operator': operator.short_name,
                'n_cars': n_cars,
            }


class StatsReducer(object):
    """Reducer that keeps, per operator, the largest ``n_cars`` value seen.

    It is called once per reduce key; the key supplies ``city_id`` and
    ``date``, which are copied into every output record.
    """

    def __call__(self, key, rows):
        """Yield one record per operator with its peak car count.

        Args:
            key: Mapping holding the ``city_id`` and ``date`` of the group.
            rows: Iterable of mappings, each with ``operator`` and
                ``n_cars``.

        Yields:
            dict with keys ``city_id``, ``date``, ``operator`` and
            ``max_seen_cars``. The peak starts from 0, so it is never
            below 0.
        """
        group_city = key['city_id']
        group_date = key['date']

        peak_by_operator = {}
        for record in rows:
            name = record['operator']
            previous_peak = peak_by_operator.get(name, 0)
            observed = record['n_cars']
            # Keep the stored value on ties, exactly as max(stored, new) does.
            if observed > previous_peak:
                peak_by_operator[name] = observed
            else:
                peak_by_operator[name] = previous_peak

        for name, peak in peak_by_operator.items():
            yield {
                'city_id': group_city,
                'date': group_date,
                'operator': name,
                'max_seen_cars': peak,
            }


def main():
    """Build the ``daily-operator-stats`` table from the log tables.

    Steps, in order:
      1. Store the ``YT_TOKEN`` environment variable in the ``'data'`` YT
         settings entry and create the ``'data'`` client.
      2. List the tables under the configured ``logs_dir``.
      3. Make sure the parent directory of the destination table exists.
      4. Run the map-reduce (``PreprocessorMapper`` / ``StatsReducer``),
         reducing by ``city_id`` and ``date``.
      5. Sort the destination table by ``city_id``, ``date``, ``operator``.

    Raises:
        KeyError: if ``YT_TOKEN`` is not set in the environment.
    """
    settings = cars.settings.YT['data']
    # The token is written before the client is built; presumably
    # make_yt_client reads it from these settings -- verify if reordering.
    settings['token'] = os.environ['YT_TOKEN']
    yt_client = make_yt_client('data')

    input_tables = yt_client.list(settings['logs_dir'], absolute=True)
    output_table = os.path.join(settings['base_dir'], 'daily-operator-stats')
    output_dir = os.path.dirname(output_table)
    ensure_yt_dir(dirname=output_dir, client=yt_client)

    job_spec = {'data_size_per_map_job': 128 * yt.common.MB}
    yt_client.run_map_reduce(
        mapper=PreprocessorMapper(),
        reducer=StatsReducer(),
        source_table=input_tables,
        destination_table=output_table,
        reduce_by=['city_id', 'date'],
        format='json',
        spec=job_spec,
    )

    yt_client.run_sort(output_table, sort_by=['city_id', 'date', 'operator'])


# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
