#!/usr/bin/env python
# coding: utf-8

import argparse
import boto3
from collections import defaultdict
import copy
from decimal import *
import logging
from multiprocessing.pool import Pool
from operator import itemgetter
import os
import pandas as pd


def get_module_logger(severity, name):
    """Return a named logger that writes key=value formatted lines to a stream handler.

    The function is safe to call more than once for the same ``name``: the
    handler is attached only when the logger has none yet, so repeated calls
    do not multiply every log line.

    :param severity: log level accepted by ``Logger.setLevel`` (e.g. ``'INFO'``).
    :param name: logger name; also rendered in every line as ``name=...``.
    :return: the configured ``logging.Logger``.
    """
    # Function-scope import: the module's import block does not provide `time`.
    import time

    logger_format = "time=%(asctime)s name=%(name)s msg=%(message)s"
    date_format = "%Y-%m-%dT%H:%M:%SZ"

    module_logger = logging.getLogger(name)

    # Do not forward records to ancestor loggers.
    module_logger.propagate = False
    module_logger.setLevel(severity)

    if not module_logger.handlers:
        formatter = logging.Formatter(logger_format, datefmt=date_format)
        # The date format ends with a literal "Z" (UTC designator), so render
        # timestamps in UTC instead of the Formatter default of local time.
        formatter.converter = time.gmtime

        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        module_logger.addHandler(handler)

    return module_logger


# Module-wide logger used by the LoadChannelSim methods below; INFO level, named 'KB LOADER'.
logger = get_module_logger('INFO', 'KB LOADER')


class LoadChannelSim(object):
    """Load channel-similarity data from pipe-delimited S3 files into DynamoDB.

    Two datasets are handled:

    * similarity stats: one record per (channel_a, channel_b) pair, grouped by
      channel_a and written as one item per channel to ``stats_table``;
    * small channels: one channel per record, written to
      ``small_channels_table`` with ``small_channel`` set to True.
    """

    # Similarity signals carried by every stats record, in CSV column order.
    SIGNALS = ('mw', 'chat', 'cheer', 'fmp', 'fmp_viewer', 'follow', 'host', 'raid', 'sub')
    # Columns present for each signal, in CSV column order.
    SIGNAL_METRICS = ('intersect', 'union', 'jaccard')

    def __init__(self, stats_table, small_channels_table):
        """
        :param stats_table: name of the DynamoDB table receiving similarity stats.
        :param small_channels_table: name of the DynamoDB table receiving small channels.
        """
        self.stats_table = stats_table
        self.small_channels_table = small_channels_table

        self.store_region = "us-west-2"

        # Column names of the headerless stats CSV:
        # channel_a, channel_b, <signal>_<metric> for every signal, total_sim.
        self.stats_header = (
            ['channel_a', 'channel_b']
            + [signal + '_' + metric for signal in self.SIGNALS for metric in self.SIGNAL_METRICS]
            + ['total_sim']
        )

        self.small_channels_header = ["channel"]
        self.parallelism_level = 15       # worker processes used for DynamoDB writes
        self.max_similar_channels = 800   # similar channels kept per stats item
        self.stats_chunk_size = 100       # channels handed to a worker per task
        self.demo_file_limit = 5          # input files read when demo_run is set

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _s3_uri(bucket, key):
        """Build the ``s3://bucket/key`` URI for an object.

        Plain string formatting is used instead of ``os.path.join`` so the
        result does not depend on the OS path separator and a key starting
        with ``/`` cannot discard the bucket component.
        """
        return "s3://{}/{}".format(bucket, key)

    @staticmethod
    def _is_manifest(from_location):
        """Return True for locations that should be skipped as manifest files."""
        # NOTE(review): this is a substring match on the whole URI, so a bucket
        # or prefix containing "manifest" would skip every file -- confirm.
        return "manifest" in from_location

    @staticmethod
    def _to_decimal(value):
        """Convert a numeric CSV value to ``Decimal`` via its float string form.

        Going through ``str(float(...))`` keeps the short repr of the float
        rather than its exact binary expansion. The boto3 DynamoDB resource
        layer expects ``Decimal`` (not ``float``) for non-integer numbers.
        """
        return Decimal(str(float(value)))

    @staticmethod
    def _chunk_channel_sim(channel_sim, chunk_size):
        """Split a ``channel -> similar channels`` mapping into dicts of at most ``chunk_size`` keys.

        No empty chunk is produced, so an empty mapping yields an empty list.
        The values are shared with ``channel_sim`` (no copies are made).
        """
        chunks = []
        current = {}
        for channel, similar_channels in channel_sim.items():
            current[channel] = similar_channels
            if len(current) >= chunk_size:
                chunks.append(current)
                current = {}
        if current:
            chunks.append(current)
        return chunks

    def _list_input_locations(self, s3_data_bucket, s3_filepath, demo_run):
        """List the S3 URIs of all objects under ``s3_filepath`` in ``s3_data_bucket``.

        When ``demo_run`` is truthy only the first ``demo_file_limit`` URIs are returned.
        """
        s3 = boto3.resource('s3')
        input_bucket = s3.Bucket(s3_data_bucket)
        files = input_bucket.objects.filter(Prefix=s3_filepath)

        from_locations = [self._s3_uri(s3_data_bucket, s3_file.key) for s3_file in files]
        if demo_run:
            from_locations = from_locations[0:self.demo_file_limit]
        return from_locations

    # -------------------------------------------------------------- item makers

    @staticmethod
    def _create_dynamo_sim_item(channel_name, row):
        """Wrap a list of similar-channel entries into the stats-table item for ``channel_name``."""
        return {
            'channel_name': str(channel_name),
            'channels': row
        }

    def _create_trimmed_dynamo_sim_item(self, channel_name, row):
        """Build the stats-table item keeping only the ``max_similar_channels`` highest ``total_sim`` entries."""
        top_channels = sorted(row, key=itemgetter('total_sim'), reverse=True)[0:self.max_similar_channels]
        return self._create_dynamo_sim_item(channel_name, top_channels)

    @staticmethod
    def _generate_stats_json(row):
        """Convert one stats record into the entry stored for ``channel_b``.

        :param row: object exposing every ``stats_header`` column as an attribute
            (a ``DataFrame.itertuples()`` row in ``load_stats``).
        :return: ``{"channel_name", "total_sim", "stats"}`` where ``stats`` holds
            an ``intersect``/``union``/``jaccard`` map for each signal whose
            intersect is greater than zero.
        """
        stats = {}
        for signal in LoadChannelSim.SIGNALS:
            intersect = int(getattr(row, signal + '_intersect'))
            if intersect > 0:
                stats[signal] = {
                    "intersect": intersect,
                    "union": int(getattr(row, signal + '_union')),
                    "jaccard": LoadChannelSim._to_decimal(getattr(row, signal + '_jaccard')),
                }

        return {
            "channel_name": str(row.channel_b),
            "total_sim": LoadChannelSim._to_decimal(row.total_sim),
            "stats": stats,
        }

    # -------------------------------------------------------------------- stats

    def _load_stats_chunk(self, channel_sim_chunk):
        """Write one chunk (``channel -> similar channels``) to the stats table.

        Called through ``Pool.map`` in ``load_stats``; the DynamoDB resource is
        created here, inside the call, rather than shared with the caller.
        """
        dynamodb = boto3.resource('dynamodb', region_name=self.store_region)
        table = dynamodb.Table(self.stats_table)
        logger.info("# of records in chunk: %s", len(channel_sim_chunk))

        with table.batch_writer() as batch_connection:
            for channel_id, similar_channels in channel_sim_chunk.items():
                item = self._create_trimmed_dynamo_sim_item(channel_id, similar_channels)
                batch_connection.put_item(Item=item)
        logger.info("Finished loading # of records: %s", len(channel_sim_chunk))

    def load_stats(self, s3_data_bucket, s3_filepath, demo_run=True):
        """Read all stats files under the S3 prefix and load them into the stats table.

        :param s3_data_bucket: bucket holding the input files.
        :param s3_filepath: key prefix of the stats dataset.
        :param demo_run: when truthy, only the first few listed files are read.
        """
        logger.info('Saving STATS output to DynamoDB')

        # Each record of each csv file holds channel_a -> channel_b similarity stats.
        from_locations = self._list_input_locations(s3_data_bucket, s3_filepath, demo_run)
        logger.info("Total number of files to load: %s", len(from_locations))

        # channel_a -> list of {similarity stats with channel_b}
        channel_sim = defaultdict(list)
        for from_location in from_locations:
            if self._is_manifest(from_location):
                continue
            logger.info('Downloading: ' + from_location + ' to DynamoDB')
            records = pd.read_csv(from_location, sep='|', header=None, names=self.stats_header, index_col=False)

            for row in records.itertuples():
                json_item = self._generate_stats_json(row)
                channel_sim[row.channel_a].append(json_item)

        logger.info("Total channels: %s", len(channel_sim))

        # One task per chunk so the chunks spread evenly over the workers.
        list_of_chunks = self._chunk_channel_sim(channel_sim, self.stats_chunk_size)

        logger.info("Total chunks to load in parallel: %s", len(list_of_chunks))
        with Pool(self.parallelism_level) as pool:
            pool.map(self._load_stats_chunk, list_of_chunks, chunksize=1)

    # ----------------------------------------------------------- small channels

    def _load_small_channels_file(self, from_location):
        """Write every channel listed in one small-channels file to the small-channels table.

        Manifest files are skipped. Called through ``Pool.map`` in
        ``load_small_channels``.
        """
        if self._is_manifest(from_location):
            return
        dynamodb = boto3.resource('dynamodb', region_name=self.store_region)
        table = dynamodb.Table(self.small_channels_table)

        records = pd.read_csv(from_location, sep='|', header=None, names=self.small_channels_header, index_col=False)

        logger.info("Total number of records: %s", len(records))
        with table.batch_writer() as batch_connection:
            for row in records.itertuples():
                json_item = {"channel_name": str(row.channel), "small_channel": True}
                batch_connection.put_item(Item=json_item)
        logger.info("Finished batch for file: " + from_location)

    def load_small_channels(self, s3_data_bucket, s3_filepath, demo_run=True):
        """Load all small-channels files under the S3 prefix into the small-channels table.

        :param s3_data_bucket: bucket holding the input files.
        :param s3_filepath: key prefix of the small-channels dataset.
        :param demo_run: when truthy, only the first few listed files are read.
        """
        logger.info('Loading input')
        logger.info('Saving SMALL CHANNELS output DynamoDB')

        from_locations = self._list_input_locations(s3_data_bucket, s3_filepath, demo_run)
        logger.info("Total number of files to load: %s", len(from_locations))

        # chunksize=1: each file is its own task, so fewer than 100 files are
        # still spread over the workers instead of landing on a single one.
        with Pool(self.parallelism_level) as pool:
            pool.map(self._load_small_channels_file, from_locations, chunksize=1)

    # ---------------------------------------------------------------- profiling

    def profile_store(self):
        """Log the ``describe_table`` output of both DynamoDB tables."""
        # Use the same region the loaders write to, so the described tables
        # are the ones that were just loaded.
        dynamodb = boto3.client('dynamodb', region_name=self.store_region)
        stats_table = dynamodb.describe_table(
            TableName=self.stats_table
        )
        logger.info("Stats table stats: " + str(stats_table))

        small_channels_table = dynamodb.describe_table(
            TableName=self.small_channels_table
        )
        logger.info("Small Channels table stats: " + str(small_channels_table))


# DynamoDB table names handed to LoadChannelSim by the script entry point below.
# Stats table; the trailing comment records an alternative ("-v2") table name.
DYNAMO_STATS_STORE_TABLE = "kevin-bacon-channel-sim-full" # "kevin-bacon-channel-sim-full-v2"
# Small-channels table.
DYNAMO_SMALL_CH_STORE_TABLE = "kevin-bacon-small-channels"

# The bare string below is not a docstring; it only documents a sample invocation.
'''
Sample call:
kb_load_data.py --s3_bucket kevin-bacon-channel-sim --s3_stats_path data/kb_sim_wide_agg_25ccu --s3_small_ch_path data/partst01sim
'''

def parse_bool_arg(value):
    """Parse a command-line boolean such as ``true``/``false``, ``yes``/``no``, ``1``/``0``.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, which is
    True for every non-empty value -- including ``"False"``. This parser maps
    the usual spellings explicitly. An empty string still maps to False, as it
    did under ``type=bool``.

    :param value: raw argument string (a ``bool`` is returned unchanged).
    :return: the parsed boolean.
    :raises argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got: ' + repr(value))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Data loader for channel-similarity-service')
    parser.add_argument('--s3_bucket', help='s3 bucket for data ingest', type=str, required=True)
    parser.add_argument('--s3_stats_path', help='path to dataset for channel similarity stats', type=str, required=True)
    parser.add_argument('--s3_small_ch_path', help='path to dataset for small channels list', type=str, required=True)
    # Defaults to a demo run; pass "--demo_run false" to load the full dataset.
    parser.add_argument('--demo_run', help='test run on a small dataset', type=parse_bool_arg, required=False,
                        default=True)

    args = parser.parse_args()

    data_loader = LoadChannelSim(DYNAMO_STATS_STORE_TABLE, DYNAMO_SMALL_CH_STORE_TABLE)
    data_loader.load_stats(args.s3_bucket, args.s3_stats_path, args.demo_run)
    data_loader.load_small_channels(args.s3_bucket, args.s3_small_ch_path, args.demo_run)
    # Finish by logging the DynamoDB description of both tables.
    data_loader.profile_store()
