from nile.api.v1 import (
    Record,
    filters as nf,
    extractors as ne,
    aggregators as na,
)

from qb2.api.v1 import typing as qt
from qb2.api.v1 import filters as qf

from projects.common.nile import test_utils, dates
from projects.cargo.b2b.claims_merge import project_config

from datetime import datetime

import numpy as np

# Source tables
# Claims table; the job reads `id`, `taxi_order_id`, `created_ts` and
# `corp_client_id` from it.
CLAIMS_PATH = '//home/taxi/production/replica/postgres/cargo_claims/claims'
# Link table between claims and points; the job reads `claim_id`, `point_id`,
# `type` and `visit_status` from it.
CLAIMS_POINTS_PATH = (
    '//home/taxi/production/replica/postgres/cargo_claims/claim_points'
)
# Points table; the job reads `id`, `latitude` and `longitude` from it.
POINTS_PATH = '//home/taxi/production/replica/postgres/cargo_claims/points'
# Path template of the order data mart; the `{}` placeholder is filled with the
# result of `dates.range_selector(START_DATE, END_DATE, '%Y-%m')`.
DM_ORDER_PATH = '//home/taxi-dwh/summary/dm_order/{}'

# Constants for handling time correctly
# Bounds of the processed period as 'YYYY-MM-DD' strings: a fixed first day
# and the day on which the script is started (local clock of the machine).
START_DATE = '2020-06-01'
END_DATE = datetime.today().strftime('%Y-%m-%d')

# The same two bounds parsed back into naive datetimes (midnight of each day).
DATA_START, DATA_END = (
    datetime.strptime(bound, '%Y-%m-%d') for bound in (START_DATE, END_DATE)
)

# Number of seconds in one day.
DAY_LENGTH = 60 * 60 * 24

# Location and schema of the final data mart
# Base path of the final table. NOTE: the job does not write to this exact
# path; it appends '_schema' to it when storing the final result.
TABLE = (
    '//home/taxi-delivery/analytics/production/cargo/b2b/claims_merge/b2b_claims_w_clones_reduced'
)

# Schema passed to `.put()` for the intermediate `b2b_claims_w_clones` table
# (the output of `corp_reducer`). Each column is described by a simple Python
# type that `qt.type_from_simple_description` converts; column order is kept.
FIRST_SCHEMA_PYTHON = {
    column: qt.type_from_simple_description(simple_type)
    for column, simple_type in (
        ('claim_id', int),
        ('clones', [str]),
        ('corp_client_id', str),
        ('corp_client_name', str),
        ('created_ts', float),
        ('lat_list', [float]),
        ('lon_list', [float]),
        ('order_id', str),
        ('success_order_flg', bool),
        ('utc_order_dt', str),
        ('utc_start_transporting_dttm', str),
        ('utc_start_waiting_dttm', str),
    )
}

# Schema passed to `.put()` for the final aggregated table (the output of
# `final_reducer`). Each column is described by a simple Python type that
# `qt.type_from_simple_description` converts; column order is kept.
SCHEMA_PYTHON = {
    column: qt.type_from_simple_description(simple_type)
    for column, simple_type in (
        ('corp_client_id', str),
        ('corp_client_name', str),
        ('has_driver', int),
        ('has_driver_share', float),
        ('orders_cnt', int),
        ('start_transporting', int),
        ('start_transporting_share', float),
        ('success_cnt', int),
        ('success_share', float),
        ('utc_order_dt', str),
    )
}

# First reducer: gathers the coordinates of all points of a given claim_id into lists
def claim_reducer(groups):
    """Collapse all rows of one claim into a single record.

    Args:
        groups: iterable of ``(key, records)`` pairs, one per ``claim_id``.
            ``key`` must expose ``get`` and each record ``to_dict()``.

    Yields:
        One ``Record`` per non-empty group. ``lat_list`` / ``lon_list`` hold
        the ``latitude`` / ``longitude`` of every row in iteration order;
        all other fields are copied from the last row of the group.
    """
    for key, records in groups:
        claim_id = key.get(b'claim_id')

        lat_list = []
        lon_list = []
        last_row = None

        # Single pass: collect the coordinates and remember the last row,
        # whose scalar fields describe the claim as a whole.
        for record in records:
            # Rows are normalised by the project helper before use
            # (presumably it decodes byte strings — TODO confirm).
            last_row = test_utils.to_string(record.to_dict())
            lat_list.append(last_row['latitude'])
            lon_list.append(last_row['longitude'])

        if last_row is None:
            # A group without rows has nothing to emit.
            continue

        yield Record.from_dict({
            'lat_list': lat_list,
            'lon_list': lon_list,
            'created_ts': last_row['created_ts'],
            'corp_client_id': last_row['corp_client_id'],
            'corp_client_name': last_row['corp_client_name'],
            'order_id': last_row['order_id'],
            'claim_id': claim_id,
            'success_order_flg': last_row['success_order_flg'],
            'utc_start_waiting_dttm': last_row['utc_start_waiting_dttm'],
            'utc_start_transporting_dttm': (
                last_row['utc_start_transporting_dttm']
            ),
            'utc_order_dt': last_row['utc_order_dt'],
        })


# Second reducer: for each corp_client_id, looks for similar claim_ids.
# Similar claim_ids are merged into the most recent claim_id.
def corp_reducer(groups):
    """Attach to each claim the ids of older, similar claims ("clones").

    Args:
        groups: iterable of ``(key, records)`` pairs, one per
            ``corp_client_id``. The job sorts each group by ``created_ts``
            before calling this reducer (presumably ascending); the rows are
            reversed here so that the most recent claim is processed first.

    Yields:
        One ``Record`` for every claim that was not itself marked as a clone
        of a more recent claim. The record carries all input fields plus
        ``clones`` (list of ``claim_id`` values) and ``corp_client_id``
        taken from the group key.
    """
    for key, records in groups:
        corp_id = key.get(b'corp_client_id')

        claims = [test_utils.to_string(record.to_dict()) for record in records]
        claims.reverse()

        # A set gives O(1) membership checks; the ids are only ever tested
        # with `in`, so the results are the same as with a list.
        absorbed_ids = set()

        for position, current_claim in enumerate(claims):
            if current_claim['claim_id'] in absorbed_ids:
                continue

            current_ts = current_claim['created_ts']
            current_lat = current_claim['lat_list']
            current_lon = current_claim['lon_list']
            clones = []

            for older_position in range(position + 1, len(claims)):
                older_claim = claims[older_position]

                # Stop at the first claim created more than DAY_LENGTH
                # before the current one: everything after it is older still.
                if current_ts - DAY_LENGTH > older_claim['created_ts']:
                    break

                # NOTE(review): a claim already absorbed by a more recent
                # claim is not skipped here, so it may end up in several
                # `clones` lists — this matches the previous behaviour.
                if are_point_lists_similar(
                        current_lat,
                        current_lon,
                        older_claim['lat_list'],
                        older_claim['lon_list'],
                ):
                    clones.append(older_claim['claim_id'])
                    absorbed_ids.add(older_claim['claim_id'])

            current_claim['clones'] = clones
            current_claim['corp_client_id'] = corp_id
            yield Record.from_dict(current_claim)


# Final reducer: assembles the tidy data mart by computing all the metrics
def final_reducer(groups):
    """Aggregate claims into per-client, per-day metrics.

    Args:
        groups: iterable of ``(key, records)`` pairs, one per
            ``(corp_client_id, utc_order_dt)`` combination.

    Yields:
        One ``Record`` per non-empty group with:
        ``orders_cnt`` — number of rows in the group;
        ``success_cnt`` — rows with a truthy ``success_order_flg``;
        ``has_driver`` — rows whose ``utc_start_waiting_dttm`` is not None;
        ``start_transporting`` — rows whose ``utc_start_transporting_dttm``
        is not None; and the matching ``*_share`` values (count divided by
        ``orders_cnt``). ``corp_client_name`` comes from the last row.
    """
    for key, records in groups:
        utc_order_dt = key.get(b'utc_order_dt')
        corp_id = key.get(b'corp_client_id')

        orders_cnt = 0
        has_driver = 0
        start_transporting = 0
        success_cnt = 0
        corp_client_name = None

        # Counters are accumulated on the fly; the rows are not kept.
        for record in records:
            row = test_utils.to_string(record.to_dict())
            orders_cnt += 1
            if row['success_order_flg']:
                success_cnt += 1
            if row['utc_start_waiting_dttm'] is not None:
                has_driver += 1
            if row['utc_start_transporting_dttm'] is not None:
                start_transporting += 1
            corp_client_name = row['corp_client_name']

        if not orders_cnt:
            # Nothing to report (and nothing to divide by) for an empty group.
            continue

        # The shares rely on true division; the script is Python 3 only
        # (it already uses datetime.timestamp()).
        yield Record.from_dict({
            'utc_order_dt': utc_order_dt,
            'corp_client_id': corp_id,
            'corp_client_name': corp_client_name,
            'orders_cnt': orders_cnt,
            'has_driver': has_driver,
            'has_driver_share': has_driver / orders_cnt,
            'start_transporting': start_transporting,
            'start_transporting_share': start_transporting / orders_cnt,
            'success_cnt': success_cnt,
            'success_share': success_cnt / orders_cnt,
        })


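# Helper that compares two coordinate values to within a small absolute tolerance.
# NOTE(review): the original note says "a couple of meters", but the tolerance is 1e-6;
# if the values are degrees, that is roughly 0.1 m of latitude — confirm the intent.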
def are_numbers_equal(x, y):
    """Return True when ``x`` and ``y`` differ by at most 0.000001.

    The comparison is absolute (not relative) and inclusive of the bound.
    """
    tolerance = 0.000001
    distance = abs(x - y)
    return distance <= tolerance


# Function that decides whether two claim_ids are the same, based on the points
# specified in those claim_ids
def are_point_lists_similar(current_lat, current_lon, next_lat, next_lon):
    """Tell whether two claims share enough points to be treated as the same.

    Every point of the current claim is compared with every point of the
    next claim; a pair counts as a match when both the latitudes and the
    longitudes are equal according to ``are_numbers_equal``.

    Args:
        current_lat, current_lon: coordinates of the current claim's points;
            index ``i`` of both lists describes one point.
        next_lat, next_lon: the same for the claim being compared.

    Returns:
        True when the number of matching pairs is at least 0.8 times the
        length of the longest of the four lists. Note that this is True for
        four empty lists, and that repeated points are counted once per pair.
    """
    required_share = 0.8
    longest = max(map(len, (current_lat, current_lon, next_lat, next_lon)))

    matched_pairs = 0
    for cur_idx, cur_lat in enumerate(current_lat):
        for nxt_idx, nxt_lat in enumerate(next_lat):
            # Both longitudes are looked up before comparing anything,
            # exactly as the pairing by index requires.
            cur_lon = current_lon[cur_idx]
            nxt_lon = next_lon[nxt_idx]
            if not are_numbers_equal(cur_lat, nxt_lat):
                continue
            if are_numbers_equal(cur_lon, nxt_lon):
                matched_pairs += 1

    return matched_pairs >= required_share * longest


# The job itself: transforms the source tables into the target data mart
if __name__ == '__main__':
    # Build the job on the project cluster.
    cluster = project_config.get_project_cluster(parallel_operations_limit=10)
    job = cluster.job()

    # Link between claim_id and point_id (rows without claim_id are dropped).
    claims_points = (
        job.table(CLAIMS_POINTS_PATH)
        .filter(qf.defined('claim_id'))
        .project('claim_id', 'point_id', 'type', 'visit_status')
    )

    # Point data: the identifier and the coordinates.
    points = job.table(POINTS_PATH).project(
        point_id='id', latitude='latitude', longitude='longitude',
    )

    # Claim data: only corporate claims that have a non-empty taxi order id
    # and were created inside the [DATA_START, DATA_END) window.
    claims = (
        job.table(CLAIMS_PATH)
        .filter(
            qf.defined('corp_client_id'),
            qf.defined('taxi_order_id'),
            nf.not_(nf.equals('taxi_order_id', b'')),
            nf.and_(
                nf.custom(lambda a: a >= DATA_START.timestamp(), 'created_ts'),
                nf.custom(lambda a: a < DATA_END.timestamp(), 'created_ts'),
            ),
        )
        .project(
            claim_id='id',
            order_id='taxi_order_id',
            created_ts='created_ts',
            corp_client_id='corp_client_id',
        )
    )

    # Main order data mart: monthly tables covering START_DATE..END_DATE,
    # without fake / fraud / mqc orders, cargo tariffs only.
    dm_order_months = dates.range_selector(START_DATE, END_DATE, '%Y-%m')
    orders = (
        job.table(DM_ORDER_PATH.format(dm_order_months))
        .filter(
            nf.equals('fake_order_flg', False),
            nf.equals('fraud_order_flg', False),
            nf.equals('mqc_order_flg', False),
            nf.or_(
                nf.equals('order_tariff', b'cargo'),
                nf.equals('order_tariff', b'cargocorp'),
            ),
        )
        .project(
            order_id='order_id',
            corp_client_name='corp_client_name',
            success_order_flg='success_order_flg',
            utc_order_dt='utc_order_dt',
            utc_start_waiting_dttm='utc_start_waiting_dttm',
            utc_start_transporting_dttm='utc_start_transporting_dttm',
        )
    )

    # Stage 1: claims with their destination points that have coordinates.
    destination_points = (
        claims.join(claims_points, by='claim_id', type='left')
        .join(points, by='point_id', type='left')
        .filter(
            nf.equals('type', b'destination'),
            qf.defined('latitude'),
            qf.defined('longitude'),
        )
    )

    # Stage 2: keep only claims whose order is present in the order data mart.
    claim_rows = destination_points.join(orders, by='order_id', type='inner')

    # Stage 3: one record per claim with the lists of its coordinates.
    per_claim = claim_rows.groupby('claim_id').reduce(
        claim_reducer, intensity='ultra_cpu',
    )

    # Stage 4: within each corporate client, attach similar older claims as
    # clones and store the intermediate table.
    claims_with_clones = (
        per_claim.groupby('corp_client_id')
        .sort('created_ts')
        .reduce(corp_reducer, intensity='ultra_cpu')
        .put('//home/taxi-delivery/analytics/production/cargo/b2b/claims_merge/b2b_claims_w_clones', schema=FIRST_SCHEMA_PYTHON)
    )

    # Stage 5: per-client, per-day metrics written to the final table.
    daily_metrics = (
        claims_with_clones.groupby('corp_client_id', 'utc_order_dt')
        .reduce(final_reducer, intensity='ultra_cpu')
        .put(TABLE + '_schema', schema=SCHEMA_PYTHON)
    )

    job.run()
