"""
Converts user profile from advisor mongo to a dataframe
of (user-item-features).
"""

import datetime
import json
import logging
import re
from calendar import timegm

import numpy as np
from bson import json_util
from httpgeobase import FED_SUBJECT_TYPE, CITY_TYPE

from jafar_yt.utils.helpers import get_lookuper, force_ascii
from jafar_yt.utils.structarrays import DataFrame, Series

logger = logging.getLogger(__name__)

# Names of the feature sets UserProfileConverter can build. Each name is used to
# look up a `get_<name>_feature_set` method and a dtype entry in
# `UserProfileConverter.get_user_features_dtype`.
FEATURE_SETS = (
    # user
    'device',
    'os',
    'lbs',
    'profile_age',
    'crypta',
    'updated_age'
)

# Maps a geobase region type constant to the suffix of the corresponding
# `lbs_region_<suffix>` feature column.
REGION_TYPES = {
    FED_SUBJECT_TYPE: 'subject',
    CITY_TYPE: 'city'
}

# Match a leading `$` that is not followed by another `$` (i.e. keys like `$date`) ...
SINGLE_STARTING_DOLLAR_REGEX = r'^\$(?!\$)'
# ... and a leading `$$` that is not followed by a third `$` (i.e. keys like `$$date`).
DOUBLE_STARTING_DOLLAR_REGEX = r'^\$\$(?!\$)'

# (segment name, segment code) pairs. The code is the key read from the profile's
# crypta 'age' dict, the name becomes the suffix of the `crypta_age_<name>` column.
CRYPTA_AGE = [
    ('0_17', '0'),
    ('18_24', '1'),
    ('25_34', '2'),
    ('35_44', '3'),
    ('45_99', '4')
]
# Same layout as CRYPTA_AGE, for the crypta 'income' dict and `crypta_income_<name>` columns.
CRYPTA_INCOME = [
    ('low', '0'),
    ('middle', '1'),
    ('high', '2')
]
# Keys of the scalar values read directly from the profile's crypta dict.
CRYPTA_GENDER = 'gender'
CRYPTA_LOYALTY = 'loyalty'


def modify_keys(dictionary, pattern, repl):
    """
    Returns a copy of `dictionary` whose keys are rewritten with
    ``re.sub(pattern, repl, key)``. Values that are dicts are processed
    recursively; any other value (including lists of dicts) is kept as is.

    The input dictionary and its nested dicts are not modified.

    :param dictionary: dict with string keys
    :param pattern: regular expression applied to every key
    :param repl: replacement passed to ``re.sub``
    :return: new dict of the same type as `dictionary`
    """
    result = dictionary.copy()
    # Iterate over a snapshot of the input: keys are added to and removed from
    # `result` inside the loop, and mutating a dict while iterating over it
    # may skip entries, revisit them or raise RuntimeError.
    for key, value in list(dictionary.items()):
        new_key = re.sub(pattern, repl, key)
        if isinstance(value, dict):
            value = modify_keys(value, pattern, repl)
        result[new_key] = value
        if new_key != key:
            del result[key]
    return result


def mongo_to_yt(document):
    """
    Turns a mongo document into a plain JSON-compatible dict suitable for YT.
    """
    # json_util knows how to dump mongo-specific values (e.g. datetimes), which the
    # plain json module cannot serialize; reading the text back with plain json
    # leaves only basic JSON types
    serialized = json_util.dumps(document)
    plain = json.loads(serialized)
    # YT doesn't like keys that start with $, so a single leading $ becomes $$
    return modify_keys(plain, SINGLE_STARTING_DOLLAR_REGEX, '$$')


def yt_to_mongo(record):
    """
    Inverse of `mongo_to_yt`: restores `$`-prefixed keys and converts the
    record back into mongo types via json_util.

    The `device_id` key, if present, is not included in the result.
    The passed record itself is left untouched.

    :param record: dict read from YT
    :return: document loaded with ``bson.json_util``
    """
    # modify_keys returns a copy, so dropping `device_id` from it (instead of
    # from `record`) keeps the caller's dict intact
    doc = modify_keys(record, DOUBLE_STARTING_DOLLAR_REGEX, '$')
    doc.pop('device_id', None)
    return json_util.loads(json.dumps(doc))


def regions_to_string(region_ids):
    """ Joins region ids into a single comma-separated string. """
    return ','.join(str(region_id) for region_id in region_ids)


def regions_from_string(region_string):
    """ Parses a comma-separated string of region ids into a list of ints, skipping empty chunks. """
    region_ids = []
    for chunk in region_string.split(','):
        if chunk:
            region_ids.append(int(chunk))
    return region_ids


def operators_to_string(operators):
    """
    Encodes a list of operator dicts (with numeric 'mcc' and 'mnc' values)
    as 'mcc:mnc' pairs separated by '|'.
    """
    encoded = []
    for operator in operators:
        encoded.append("%d:%d" % (operator['mcc'], operator['mnc']))
    return '|'.join(encoded)


def operators_from_string(operators_string):
    """
    Decodes a '|'-separated string of 'mcc:mnc' pairs into a list of
    {'mcc': int, 'mnc': int} dicts. Empty chunks are skipped.
    """
    operators = []
    for chunk in operators_string.split('|'):
        if not chunk:
            continue
        mcc, mnc = chunk.split(':')
        operators.append({'mcc': int(mcc), 'mnc': int(mnc)})
    return operators


def unique_operators_from_string(operators_string):
    """
    Decodes a '|'-separated string of 'mcc:mnc' pairs into a list of
    {'mcc': int, 'mnc': int} dicts without duplicates.

    Operators are returned in the order of their first occurrence, so the
    result is deterministic. Duplicates are detected on the parsed numbers,
    hence '250:1' and '250:01' count as the same operator.
    Empty chunks are skipped.
    """
    seen = set()
    operators = []
    for chunk in operators_string.split('|'):
        if not chunk:
            continue
        mcc, mnc = chunk.split(':')
        pair = (int(mcc), int(mnc))
        if pair in seen:
            continue
        seen.add(pair)
        operators.append({'mcc': pair[0], 'mnc': pair[1]})
    return operators


class UserProfileConverter(object):
    """
    Wraps one user profile document (a dict-like object) and converts it into
    DataFrames: installed / removed / disliked items and per-user features.
    """
    # lookuper shared by all converters; created on first access of `geobase_lookuper`
    _geobase_lookuper = None

    class IncompleteProfile(Exception):
        """ Raised when profile has insufficient data to be correctly converted """
        def __init__(self, field):
            self.message = 'Profile has no field "%s"' % field

    def __init__(self, user_profile):
        self.user_profile = user_profile

    @property
    def geobase_lookuper(self):
        """ Result of `get_lookuper()`, created once and cached on the class. """
        if self._geobase_lookuper is None:
            UserProfileConverter._geobase_lookuper = get_lookuper()
        return self._geobase_lookuper

    @property
    def user_id(self):
        """ String form of the profile's `_id`. """
        return str(self.user_profile['_id'])

    @staticmethod
    def get_system_time(apps):
        """
        Heuristic estimate of the time when the most system apps were installed.

        Takes install times of system apps, finds the group of `count`
        consecutive (by time) installs with the smallest spread, where
        `count` is min(10, half of the number of system apps), and returns
        the mean install time of that group.

        :param apps: iterable of app dicts with 'first_install_time' and,
            optionally, 'is_system'
        :return: mean install time of the tightest group, or 0.0 when there
            are fewer than 4 system apps (i.e. `count` would be below 2)
        """
        system_times = sorted(app['first_install_time'] for app in apps if app.get('is_system'))
        count = min(10, len(system_times) // 2)
        if count <= 1:
            return 0.0
        # A window of `count` items can start at any index up to and including
        # len - count, so there are len - count + 1 candidate windows.
        window_count = len(system_times) - count + 1
        spreads = [system_times[i + count - 1] - system_times[i] for i in range(window_count)]
        min_idx = np.argmin(spreads)
        return np.mean(system_times[min_idx: min_idx + count])

    @staticmethod
    def is_user_app(app, system_time):
        """
        Tells whether `app` looks like it was installed by the user: it is
        neither system nor disabled and was installed more than a day away
        from `system_time`.
        """
        return (
            not app.get('is_system') and
            not app.get('is_disabled') and
            # if the application's install time is close to system time, consider it as a preinstalled app
            np.abs(app['first_install_time'] - system_time) > 1 * 24 * 60 * 60
        )

    @staticmethod
    def check_feature_sets(feature_sets):
        """
        Asserts that every requested feature set is listed in FEATURE_SETS.
        NOTE: this is an `assert`, so the check is skipped when Python runs with -O.
        """
        assert set(feature_sets).issubset(set(FEATURE_SETS)), 'Only following feature sets are supported: {}'.format(
            FEATURE_SETS)

    def get_installs(self, user_apps_only=True):
        """
        Builds a DataFrame (see `get_installs_dtype`) with one row per installed app.

        :param user_apps_only: when True, apps for which `is_user_app` is
            false are left out
        :raises IncompleteProfile: when 'installed_apps_info' is missing or None
        """
        installed_apps = self.user_profile.get('installed_apps_info')
        if installed_apps is None:
            raise self.IncompleteProfile('installed_apps_info')

        user_id = self.user_id
        system_time = self.get_system_time(installed_apps)
        items = []
        for app in installed_apps:
            is_user_app = self.is_user_app(app, system_time)
            if user_apps_only and not is_user_app:
                # ignore system apps
                continue
            items.append((user_id, app['package_name'], app['first_install_time'], is_user_app))
        return DataFrame.from_structarray(np.array(items, dtype=self.get_installs_dtype()))

    def get_removed_items(self):
        """
        Builds a DataFrame (see `get_removed_items_dtype`) from 'removed_apps_info'.
        A missing or None list produces an empty frame.
        """
        user_id = self.user_id
        removed_apps = self.user_profile.get('removed_apps_info') or []
        removed_data = [(user_id, app['package_name'], app['removal_ts']) for app in removed_apps]
        return DataFrame.from_structarray(np.array(removed_data, dtype=self.get_removed_items_dtype()))

    def get_disliked_items(self):
        """
        Builds a DataFrame (see `get_disliked_items_dtype`) from 'feedbacks'.
        A missing or None list produces an empty frame.
        """
        user_id = self.user_id
        disliked_apps = self.user_profile.get('feedbacks') or []
        disliked_apps_data = [(user_id, app['package_name'], app['timestamp']) for app in disliked_apps]
        return DataFrame.from_structarray(np.array(disliked_apps_data, dtype=self.get_disliked_items_dtype()))

    def get_user_features(self, feature_sets=FEATURE_SETS):
        """
        Builds a single-row DataFrame with the requested feature sets.

        Starts from a frame holding only the 'user' column, lets every
        `get_<name>_feature_set` method append its columns, passes every
        column through `force_ascii_if_unicode` and casts the frame to
        `get_user_features_dtype(feature_sets)`.

        :param feature_sets: names from FEATURE_SETS
        :raises IncompleteProfile: propagated from the feature set methods
        """
        self.check_feature_sets(feature_sets)
        df = DataFrame(np.array([(self.user_id,)], dtype=[('user', object)]))
        for name in feature_sets:
            method = getattr(self, 'get_{}_feature_set'.format(name))
            df = method(df)

        for name in df.dtype.names:
            df[name] = force_ascii_if_unicode(df[name])
        dtype = self.get_user_features_dtype(feature_sets)
        return df.astype(dtype)

    # NOTE: the dtype lists below use the builtins `object` / `bool`; they are what
    # the `np.object` / `np.bool` aliases pointed to, and the aliases are gone from
    # recent NumPy releases.

    @staticmethod
    def get_installs_dtype():
        """ Structured dtype of the frame returned by `get_installs`. """
        return [
            ('user', object),
            ('item', object),
            ('install_time', np.int32),
            ('is_user_app', bool)
        ]

    @staticmethod
    def get_removed_items_dtype():
        """ Structured dtype of the frame returned by `get_removed_items`. """
        return [
            ('user', object),
            ('item', object),
            ('timestamp', np.int32)
        ]

    @staticmethod
    def get_disliked_items_dtype():
        """ Structured dtype of the frame returned by `get_disliked_items`. """
        return [
            ('user', object),
            ('item', object),
            ('timestamp', np.int32)
        ]

    @classmethod
    def get_user_features_dtype(cls, feature_sets=FEATURE_SETS):
        """
        Structured dtype of the frame returned by `get_user_features`: the
        columns of the requested feature sets, in the requested order,
        followed by the 'user' column.
        """
        cls.check_feature_sets(feature_sets)
        dtypes = {
            'device': [
                ('device_model', object),
                ('device_manufacturer', object),
                ('yandex_uid', object)
            ],
            'os': [
                ('os_version_number', object),
                ('os_api_level', object)
            ],
            'lbs': [
                ('lbs_country', object),
                ('lbs_operators', object),
                ('lbs_cells', object),
                ('lbs_regions', object),
            ] + [
                ('lbs_region_{}'.format(name), np.float32)
                for name in REGION_TYPES.values()
            ],
            'profile_age': [
                ('profile_age', np.float32)
            ],
            'crypta': [
                ('crypta_age_{}'.format(segment_name), np.float32)
                for segment_name, segment_code in CRYPTA_AGE
            ] + [
                ('crypta_gender_male', np.float32),
            ] + [
                ('crypta_income_{}'.format(segment_name), np.float32)
                for segment_name, segment_code in CRYPTA_INCOME
            ] + [
                ('crypta_loyalty', np.float32)
            ],
            'updated_age': [
                ('updated_age', np.int32)
            ],
        }
        return sum(
            [dtypes[feature_set] for feature_set in feature_sets], []
        ) + [('user', object)]

    def get_device_feature_set(self, df):
        """
        Includes model and manufacturer for now (plus the passport uid).

        :raises IncompleteProfile: when android_info.os_build.string_fields
            cannot be reached (a key is missing or a parent section is None)
        """
        try:
            string_fields = self.user_profile['android_info']['os_build']['string_fields']
        except (KeyError, TypeError):
            # TypeError: an intermediate section is present but is not a dict (e.g. None)
            raise self.IncompleteProfile('android_info.os_build.string_fields')

        model = None
        manufacturer = None
        for key_value in string_fields:
            if key_value['key'] == 'MODEL':
                model = key_value['value']
            elif key_value['key'] == 'MANUFACTURER':
                manufacturer = key_value['value']
            if model is not None and manufacturer is not None:
                # both values found, no need to scan the rest
                break
        yandex_uid = self.user_profile.get('passport_uid')

        df = df.append_column(np.array([model]), 'device_model')
        df = df.append_column(np.array([manufacturer]), 'device_manufacturer')
        df = df.append_column(np.array([yandex_uid]), 'yandex_uid')
        return df

    def get_os_feature_set(self, df):
        """
        Includes os version number and API level

        :raises IncompleteProfile: when android_info.os_build.version cannot
            be reached (a key is missing or a parent section is None)
        """
        try:
            version_info = self.user_profile['android_info']['os_build']['version']
        except (KeyError, TypeError):
            # TypeError: an intermediate section is present but is not a dict (e.g. None)
            raise self.IncompleteProfile('android_info.os_build.version')

        version_number = version_info.get('release')
        api_level = version_info.get('sdk_int')

        df = df.append_column(np.array([version_number]), 'os_version_number')
        df = df.append_column(np.array([api_level]), 'os_api_level')
        return df

    def get_lbs_feature_set(self, df):
        """
        Includes country, cells, operators and some of the regions.
        Missing (or None) parts of the profile yield empty strings / None values.
        """
        lbs_info = self.user_profile.get('lbs_info') or {}
        df = df.append_column(np.array([lbs_info.get('country')]), 'lbs_country')

        mcc, mnc = lbs_info.get('mcc'), lbs_info.get('mnc')

        # TODO remove lbs_info.get('operators') and mcc, mnc
        cells_string = operators_to_string(lbs_info.get('cells') or lbs_info.get('operators') or [])
        if not cells_string and mcc and mnc:
            # no cell list: fall back to the single mcc/mnc pair stored in lbs_info
            cells_string = "%d:%d" % (mcc, mnc)
        df = df.append_column(np.array([cells_string]), 'lbs_cells')

        operators_string = operators_to_string(self.user_profile.get('operators') or [])
        df = df.append_column(np.array([operators_string]), 'lbs_operators')

        region_ids = lbs_info.get('region_ids') or []
        region_types = lbs_info.get('region_types') or []
        regions_string = regions_to_string(region_ids)
        df = df.append_column(np.array([regions_string]), 'lbs_regions')

        # region_types and region_ids are paired by position
        regions = dict(zip(region_types, region_ids))
        for region_type, name in REGION_TYPES.items():
            df = df.append_column(np.array([regions.get(region_type)]), 'lbs_region_{}'.format(name))
        return df

    def _days_since(self, field):
        """
        Whole number of days (floored) between the datetime stored in the
        profile under `field` and the current time; both are converted to
        timestamps with `timegm`, i.e. treated as UTC.

        :raises IncompleteProfile: when `field` is not in the profile
        """
        if field not in self.user_profile:
            raise self.IncompleteProfile(field)
        then = timegm(self.user_profile[field].timetuple())
        now = timegm(datetime.datetime.utcnow().timetuple())
        # explicit floor division: same result as the integer `/` of Python 2
        return (now - then) // (24 * 3600)

    def get_profile_age_feature_set(self, df):
        """
        Number of days between user's creation and current time.

        :raises IncompleteProfile: when 'created_at' is not in the profile
        """
        return df.append_column(np.array([self._days_since('created_at')]), 'profile_age')

    def get_crypta_feature_set(self, df):
        """
        Includes age, gender, income and Yandex loyalty.
        Values absent from the profile are appended as None.
        """
        crypta = self.user_profile.get('crypta') or {}
        # age
        age = crypta.get('age') or {}
        for segment_name, segment_code in CRYPTA_AGE:
            df = df.append_column(np.array([age.get(segment_code)]), 'crypta_age_{}'.format(segment_name))

        # gender
        df = df.append_column(np.array([crypta.get(CRYPTA_GENDER)]), 'crypta_gender_male')

        # income
        income = crypta.get('income') or {}
        for segment_name, segment_code in CRYPTA_INCOME:
            df = df.append_column(np.array([income.get(segment_code)]), 'crypta_income_{}'.format(segment_name))

        # loyalty
        df = df.append_column(np.array([crypta.get(CRYPTA_LOYALTY)]), 'crypta_loyalty')
        return df

    def get_updated_age_feature_set(self, df):
        """
        Number of days between the profile's last update and current time.

        :raises IncompleteProfile: when 'updated_at' is not in the profile
        """
        return df.append_column(np.array([self._days_since('updated_at')]), 'updated_age')


def force_ascii_if_unicode(series):
    """
    Applies `force_ascii` to every element of `series` and wraps the result
    into a new Series.

    As described by the original author, values are converted to utf-8 and then
    to ascii by ignoring all non-ascii symbols, and non-unicode values are left
    as they are (that logic lives in `force_ascii`, which is defined elsewhere).
    """
    # NOTE(review): np.vectorize infers the output dtype from the first element
    # and cannot handle empty input without `otypes` - confirm callers only pass
    # non-empty series with homogeneous values
    convert = np.vectorize(force_ascii)
    values = series.to_array()
    return Series(convert(values))
