import argparse
import datetime
import json
import os
import time
from dateutil.parser import parse as parse_iso8601

import yt.wrapper as yt


# Root directory of the dispatcher log tables; main() joins a date onto it
# to obtain the input table for each requested day.
LOGS_BASE = '//home/logfeller/logs/taxi-logistic-dispatcher-yandex-taxi-logistic-dispatcher-log/1d'
# Root directory of the route-proposition history tables; main() joins a date onto it.
FREEZE_LOGS_BASE = '//home/taxi/testing/export/taxi-logistic-dispatcher-production/route_propositions_history'
# Root directory of the contractor-profile history tables; main() joins a date onto it.
PROFILE_CLEANUPS_ROUTES = '//home/taxi/testing/export/taxi-logistic-dispatcher-production/contractor_profiles_history'

# Verdicts that SegmentTraceReducer skips entirely: a candidate with one of
# these is neither counted in the reject statistics nor reported as suspicious.
IGNORABLE_GAMBLE_VERDICTS = [
    'too-heavy-edge',
    'other-proposition'
]

# Reject reasons for which SegmentTraceReducer adds a 'profile_cleanup' flag
# to the suspicious-entry report.
ATTACH_CLEANUP_DATA_REASONS = [
    'payment-methods-mismatch',
    'special-requirements-mismatch',
    'no-class-intersection',
]

class IdentityMapper:
    """Mapper that emits every input row unchanged."""

    def __call__(self, row):
        # Pass-through: exactly one output row per input row.
        yield row


class SimpleOneOfMapper:
    """Filtering mapper configured with ``field=allowed_values`` keyword arguments.

    A row is emitted only when every configured field is present in the row
    and its value is contained in the corresponding collection of allowed
    values. With no conditions configured every row passes.
    """

    def __init__(self, **kwargs):
        self._conditions = kwargs

    def __call__(self, row):
        accepted = all(
            name in row and row[name] in allowed
            for name, allowed in self._conditions.items()
        )
        if accepted:
            yield row


class LogEntrySegmentDetector:
    """Mapper that turns raw dispatcher log rows into per-segment events.

    Every emitted event is a dict with a 'type', an integer 'timestamp' taken
    from the row's 'unixtime', and the 'segment_id' it refers to. Rows that
    match none of the recognised patterns produce no output.
    """

    def __call__(self, row):
        """Yield zero or more events extracted from a single log row."""
        cargo_ref_id = row.get('cargo_ref_id')
        module = row.get('module')
        source = row.get('source')
        uri = row.get('uri')
        is_service_api = module == 'ServiceApi'

        data = None
        if cargo_ref_id and is_service_api and uri == '/order-search' and row.get('response') is not None:
            data = self._parse_candidates_info(row)
        elif (cargo_ref_id and is_service_api and source == 'segments_journal'
                and uri == '/v1/segment/info' and row.get('response') is not None):
            data = self._parse_segment_data(row)
        elif (is_service_api and source == 'operator_commands_executor'
                and uri == '/v1/waybill/propose' and row.get('request') is not None):
            # A propose request can mention several segments; each event
            # already carries its own segment id, so emit them and stop.
            for event in self._parse_propose_request(row):
                yield event
            return
        elif cargo_ref_id and module == 'planner-segment-edges':
            data = self._parse_edges_info(row)
        elif cargo_ref_id and module == 'planner-assignment':
            data = self._parse_planner_assignment_info(row)

        if data:
            data['segment_id'] = cargo_ref_id
            yield data

    def _parse_propose_request(self, row):
        """Yield one 'propose_request' event per segment listed in the request."""
        json_request = row['request']
        # Everything before the first space is dropped before JSON parsing.
        # NOTE(review): presumably that prefix is a non-JSON token such as
        # the HTTP method -- confirm against the real log format.
        json_request = json_request[json_request.find(' '):]
        request = json.loads(json_request)
        for segment in request['segments']:
            yield {
                'type': 'propose_request',
                'timestamp': int(row['unixtime']),
                'propose_api_request': request,
                'segment_id': segment['segment_id'],
            }

    @staticmethod
    def _extract_entity(log, field_name, end_symbol):
        """Return the text following ``"field_name":"`` up to ``end_symbol``.

        Returns None when the field prefix is absent or when the terminator
        is never found (e.g. a truncated log line); previously the latter
        case silently returned the remainder minus its last character.
        """
        prefix = '"' + field_name + '":"'
        start = log.find(prefix)
        if start < 0:
            return None
        start += len(prefix)
        end = log.find(end_symbol, start)
        if end < 0:
            return None
        return log[start:end]

    @classmethod
    def _extract_string(cls, log, field_name):
        """Extract a double-quoted string value."""
        return cls._extract_entity(log, field_name, '"')

    @classmethod
    def _extract_array(cls, log, field_name):
        """Extract the raw text of a value terminated by ']'.

        NOTE(review): the shared prefix ends with ``":"``, so this only
        matches when the value starts with a double quote; a plain JSON
        array (``"field":[...]``) would not match -- confirm the log format.
        """
        return cls._extract_entity(log, field_name, ']')

    def _parse_segment_data(self, row):
        """Build a 'segment_info' event from a segment-info response.

        The response is not valid JSON, so individual fields are pulled out
        with plain substring searches instead of a JSON parser.
        """
        response = row['response']

        employer_code = self._extract_string(response, 'employer')
        router_id = self._extract_string(response, 'router_id')
        corp_client_id = self._extract_string(response, 'corp_client_id')
        due = self._extract_string(response, 'due')
        zone_id = self._extract_string(response, 'zone_id')
        # Fixed: the value used to be stored under a misspelled local name,
        # which made the return statement below raise NameError.
        tariffs_substitution = self._extract_array(response, 'tariffs_substitution')
        lookup_started_at = self._extract_string(response, 'lookup_started_at')

        # The lead time needs both timestamps; leave it unset otherwise
        # instead of passing None to the date parser.
        time_to_due = None
        if due is not None and lookup_started_at is not None:
            due_dt = parse_iso8601(due)
            creation_dt = parse_iso8601(lookup_started_at)
            time_to_due = (due_dt - creation_dt).total_seconds()

        return {
            'type': 'segment_info',
            'timestamp': int(row['unixtime']),
            'employer_code': employer_code,
            'router_id': router_id,
            'corp_client_id': corp_client_id,
            'zone_id': zone_id,
            'time_to_due': time_to_due,
            'tariffs_substitution': tariffs_substitution,
        }

    def _parse_candidates_info(self, row):
        """Build a 'candidates' event from an order-search response.

        Returns None when the response cannot be parsed or has no
        'candidates' key.
        """
        try:
            response = json.loads(row['response'])
        except Exception:
            # Deliberate best effort: unparsable responses are skipped.
            return None
        if 'candidates' not in response:
            return None
        available_candidates = [
            {
                'candidate_id': candidate['id'],
                'route_info': candidate['route_info'],
                'transport': candidate['transport'],
            }
            for candidate in response['candidates']
        ]
        return {
            'type': 'candidates',
            'timestamp': int(row['unixtime']),
            'available_candidates': available_candidates
        }

    def _parse_edges_info(self, row):
        """Build an 'edges' event from the JSON stored in the row's 'text'."""
        edges = json.loads(row['text'])['edges']
        available_candidates = [
            {
                'contractor_id': edge['driver_id'],
                'kind': edge['kind']
            }
            for edge in edges
        ]
        return {
            'type': 'edges',
            'timestamp': int(row['unixtime']),
            'available_candidates': available_candidates
        }

    def _parse_planner_assignment_info(self, row):
        """Build an 'assignment' event for the contractor named in the row."""
        return {
            'type': 'assignment',
            'timestamp': int(row['unixtime']),
            'contractor_id': row['driver_id']
        }


class SegmentTraceReducer:
    """Reducer that folds all events of one segment into a single summary row.

    Constructor arguments:
        freeze_data: dict mapping a contractor id to an iterable of objects
            exposing ``contains(timestamp)`` and ``get_report()``.
        profile_cleanups_data: dict mapping a contractor id to an iterable
            of profile-cleanup timestamps.

    The events are consumed in the order given; main() sorts the input table
    by segment id and timestamp before running this reducer.
    """

    # Upper bound on the number of entries kept in 'suspicious_entries'.
    _MAX_SUSPICIOUS_ENTRIES = 100

    def __init__(self, freeze_data, profile_cleanups_data):
        self._freeze_data = freeze_data
        self._profile_cleanups_data = profile_cleanups_data

    def __call__(self, segment_id, rows):
        """Yield at most one summary dict for the segment.

        ``segment_id`` is the reduce key (a mapping with a 'segment_id'
        entry) and ``rows`` the segment's events. Nothing is yielded when no
        'segment_info' event supplied a router id.
        """
        n_candidates_requests = 0
        n_times_assigned = 0
        n_candidates_in_reply = 0
        n_gambles = 0
        n_propose_requests = 0
        n_total_edges = 0

        employer_code = None
        zone_id = None
        router_id = None
        time_to_due = None
        corp_client_id = None
        tariffs_substitution = None

        suspicious_entries = []
        reject_reasons_count = {}
        last_candidates_timestamp = 0
        # Candidates collected since the previous 'edges' event.
        last_candidates = {}

        for row in rows:
            event_type = row['type']

            if event_type == 'assignment':
                n_times_assigned += 1

            elif event_type == 'segment_info':
                # First non-empty value wins, except for router_id, which
                # always takes the value from the latest event.
                employer_code = employer_code or row['employer_code']
                router_id = row['router_id']
                corp_client_id = corp_client_id or row['corp_client_id']
                time_to_due = time_to_due or row['time_to_due']
                zone_id = zone_id or row['zone_id']
                tariffs_substitution = tariffs_substitution or row['tariffs_substitution']

            elif event_type == 'propose_request':
                n_propose_requests += 1

            elif event_type == 'candidates':
                n_candidates_requests += 1
                last_candidates_timestamp = row['timestamp']
                # Fixed: count the candidates of this reply. The previous
                # code added the size of the accumulator *before* merging
                # the reply, so the first reply always contributed zero and
                # the most recent reply was never counted.
                n_candidates_in_reply += len(row['available_candidates'])
                for candidate in row['available_candidates']:
                    last_candidates[candidate['candidate_id']] = {
                        'route_info': candidate['route_info'],
                        'transport': candidate['transport'],
                    }

            elif event_type == 'edges' and last_candidates_timestamp > 0:
                # Edges are only analysed once at least one candidates
                # reply has been seen for the segment.
                n_gambles += 1
                n_total_edges += len(row['available_candidates'])
                reject_reasons = {
                    record['contractor_id']: record['kind']
                    for record in row['available_candidates']
                }
                for candidate_id, meta in last_candidates.items():
                    reject_reason = reject_reasons.get(candidate_id)
                    if not reject_reason:
                        # No edge for this candidate: fall back to the freeze
                        # report for the moment the candidates were fetched
                        # (None when no freeze interval matches).
                        reject_reason = self._get_freeze_data(candidate_id, last_candidates_timestamp)
                    if reject_reason in IGNORABLE_GAMBLE_VERDICTS:
                        continue
                    reject_reasons_count[reject_reason] = reject_reasons_count.get(reject_reason, 0) + 1
                    if len(suspicious_entries) >= self._MAX_SUSPICIOUS_ENTRIES:
                        # Keep counting reasons, but stop growing the report.
                        continue
                    report = {
                        'candidate_id': candidate_id,
                        'lag': row['timestamp'] - last_candidates_timestamp,
                        'kind': reject_reason,
                        'meta': meta,
                    }
                    if reject_reason == 'regular':
                        report['freeze_data'] = self._get_freeze_data(candidate_id, row['timestamp'])
                    if reject_reason in ATTACH_CLEANUP_DATA_REASONS:
                        report['profile_cleanup'] = self._get_profile_cleanup_data(candidate_id, row['timestamp'])
                    suspicious_entries.append(report)
                last_candidates = {}

        # The None key holds candidates without any known reason; it is
        # reported separately as 'propose_rejections' and excluded here.
        most_popular_reject_reason = None
        for reason, count in reject_reasons_count.items():
            if reason is None:
                continue
            if not most_popular_reject_reason or count > reject_reasons_count[most_popular_reject_reason]:
                most_popular_reject_reason = reason

        if router_id:
            yield {
                'suspicious_entries': suspicious_entries,
                'n_gambles': n_gambles,
                'n_candidates_requests': n_candidates_requests,
                'n_total_edges': n_total_edges,
                'n_candidates_in_reply': n_candidates_in_reply,
                'n_times_assigned': n_times_assigned,
                'employer_code': employer_code,
                'zone_id': zone_id,
                'router_id': router_id,
                'corp_client_id': corp_client_id,
                'time_to_due': time_to_due,
                'most_popular_reject_reason': most_popular_reject_reason,
                'segment_id': segment_id['segment_id'],
                'propose_rejections': reject_reasons_count.get(None, 0),
                'n_propose_requests': n_propose_requests,
                'tariffs_substitution': tariffs_substitution,
            }

    def _get_freeze_data(self, contractor_id, timestamp):
        """Return the report of the first freeze entry containing ``timestamp``, or None."""
        for freeze_entry in self._freeze_data.get(contractor_id, []):
            if freeze_entry.contains(timestamp):
                return freeze_entry.get_report()
        return None

    def _get_profile_cleanup_data(self, contractor_id, timestamp):
        """Return True when a profile cleanup lies within 60 of ``timestamp`` (inclusive)."""
        for profile_cleanup_time in self._profile_cleanups_data.get(contractor_id, []):
            if profile_cleanup_time - 60 <= timestamp <= profile_cleanup_time + 60:
                return True
        return False

    def _get_event_priority(self, event_type):
        """Return an ordering rank: 'candidates' first, then 'edges', then the rest."""
        if event_type == 'candidates':
            return -1
        if event_type == 'edges':
            return 0
        return 1


class FreezeConstructionReducer:
    """Reducer that converts a history of rows into contractor freeze intervals.

    Walks the rows in the order given and emits one record per uninterrupted
    run of the same contractor: its id, the timestamp where the run started,
    the timestamp of the row that ended it, and the running sequence number
    of that run within the group.
    """

    def __call__(self, key, rows):
        active_contractor = None
        interval_start = None
        sequence_no = 0
        for row in rows:
            # An empty string means "no contractor"; normalise it in place.
            if row['contractor_id'] == '':
                row['contractor_id'] = None
            current = row['contractor_id']
            moment = int(row['history_timestamp'])
            last_timestamp = moment

            switched = active_contractor and current != active_contractor
            if not current or switched:
                # The running interval (if any) is closed by this row.
                if active_contractor:
                    yield {
                        'contractor_id': active_contractor,
                        'freeze_start': interval_start,
                        'freeze_end': moment,
                        'prop_seq_id': sequence_no
                    }
                active_contractor = current
                if active_contractor:
                    sequence_no += 1
                    interval_start = moment
            elif active_contractor is None:
                # First contractor seen after a gap opens a new interval.
                active_contractor = current
                interval_start = moment
                sequence_no += 1

        # Close whatever interval is still open at the end of the group.
        if active_contractor:
            yield {
                'contractor_id': active_contractor,
                'freeze_start': interval_start,
                'freeze_end': last_timestamp,
                'prop_seq_id': sequence_no
            }


class ProfileCleanupsMapper:
    """Mapper that keeps only 'remove' history actions.

    For each such row it emits the integer history timestamp together with
    the operator contractor id; all other rows are dropped.
    """

    def __call__(self, row):
        if row['history_action'] != 'remove':
            return
        removed_at = int(row['history_timestamp'])
        yield {
            'timestamp': removed_at,
            'contractor_id': row['operator_contractor_id'],
        }


class FetcherErrorsMapper:
    """Mapper that selects 'Contractor busy, but not for candidates' log rows.

    Only rows written by the 'TModelFetcher' module that carry a driver id
    are kept; each yields the integer unixtime and the driver id.
    """

    def __call__(self, row):
        if row.get('module') != 'TModelFetcher':
            return
        if row.get('text') != 'Contractor busy, but not for candidates':
            return
        if row.get('driver_id') is None:
            return
        yield {
            'timestamp': int(row['unixtime']),
            'contractor_id': row['driver_id'],
        }


class FreezeInfo:
    """One freeze interval of a contractor, built from a freeze-table row.

    The row must provide 'contractor_id', 'freeze_start', 'freeze_end' and
    'prop_seq_id'.
    """

    def __init__(self, row):
        self.contractor_id = row['contractor_id']
        self.freeze_start = row['freeze_start']
        self.freeze_end = row['freeze_end']
        self.prop_seq_id = row['prop_seq_id']

    @property
    def is_parasite(self):
        """True when the proposition sequence number exceeds 10."""
        return self.prop_seq_id > 10

    def contains(self, timestamp):
        """Tell whether ``timestamp`` falls inside the interval widened by 90 on both sides."""
        return self.freeze_start - 90 <= timestamp <= self.freeze_end + 90

    def get_report(self):
        """Return the verdict string describing this interval."""
        if self.is_parasite:
            return 'parasite-proposition({})'.format(self.prop_seq_id)
        return 'other-proposition'


def main():
    """Run the segment-tracing pipeline for the dates given on the command line.

    Steps: map the raw logs of every requested date into per-segment events,
    sort them by segment and time, load the freeze intervals and the profile
    cleanup timestamps from their tables into memory, then reduce the events
    into one timeline row per segment. The stages that would rebuild the
    freeze and profile-cleanup tables are currently disabled and kept below
    as comments.
    """
    parser = argparse.ArgumentParser(
        description='Trace the destiny of all delivery orders within a day and find \
        the cases where dispatch did something weird'
    )
    parser.add_argument('--dates', type=str, required=True, help='Dates in ISO format')
    args = parser.parse_args()

    # '--dates' is a comma-separated list; every date names one input table.
    dates = args.dates.split(',')
    table_routes = [os.path.join(LOGS_BASE, date) for date in dates]
    freeze_table_routes = [os.path.join(FREEZE_LOGS_BASE, date) for date in dates]
    profile_cleanups_routes = [os.path.join(PROFILE_CLEANUPS_ROUTES, date) for date in dates]

    # Stage 1: extract per-segment events and order them for the reducer.
    mapper = LogEntrySegmentDetector()
    aux_table_route = '//home/taxi/home/skulik/cr-analytics/eventlog-events'
    yt.run_map(
        mapper,
        table_routes,
        aux_table_route,
        spec={'combine_chunks': True, 'data_size_per_job': 1535852842},
    )
    yt.run_sort(aux_table_route, sort_by=['segment_id', 'timestamp'])

    # Stage 2 (disabled): rebuild the freeze-interval table from history.
    aux_freeze_table_route = '//tmp/zxqfd555-eventlog-events-raw-united'
    freeze_identity_mapper = IdentityMapper()
    #yt.run_map(freeze_identity_mapper, freeze_table_routes, aux_freeze_table_route)
    #yt.run_sort(aux_freeze_table_route, sort_by=['proposition_id', 'history_timestamp'])
    freeze_reducer = FreezeConstructionReducer()
    freeze_final_table_route = '//home/taxi/home/skulik/cr-analytics/freeze-events'
    #yt.run_reduce(freeze_reducer, aux_freeze_table_route, freeze_final_table_route, reduce_by=['proposition_id'])

    # Group the freeze intervals by contractor id.
    freeze_data = {}
    for entry in yt.read_table(freeze_final_table_route, format='yson'):
        freeze_info = FreezeInfo(entry)
        freeze_data.setdefault(freeze_info.contractor_id, []).append(freeze_info)

    # Stage 3 (disabled): rebuild the profile-cleanup table from history.
    profile_cleanups_mapper = ProfileCleanupsMapper()
    aux_profile_cleanups_route = '//home/taxi/home/skulik/cr-analytics/profile-cleanup-events'
    #yt.run_map(profile_cleanups_mapper, profile_cleanups_routes, aux_profile_cleanups_route)

    # Group the cleanup timestamps by contractor id.
    profile_cleanups_data = {}
    for entry in yt.read_table(aux_profile_cleanups_route, format='yson'):
        contractor_id = entry['contractor_id']
        timestamp = entry['timestamp']
        profile_cleanups_data.setdefault(contractor_id, []).append(timestamp)

    # Stage 4: fold every segment's events into its timeline row.
    timelines_route = '//home/taxi/home/skulik/cr-analytics/timelines'
    reducer = SegmentTraceReducer(freeze_data, profile_cleanups_data)
    yt.run_reduce(reducer, aux_table_route, timelines_route, reduce_by=['segment_id'])


if __name__ == '__main__':
    # The YT pipeline entry point is currently disabled; the code below is
    # run instead when the file is executed as a script.
    # main()

    def extr(module, resp, text, uri):
        """Collect candidate ids from a ServiceApi '/order-search' response.

        Returns the list of 'id' values of the response's candidates, an
        empty list when the row is not such a response, and None when the
        parsed response has no 'candidates' key. ``text`` is only used by
        the commented-out branch below.
        """
        available_candidates = []
        # Disabled branch: collect contractor ids and kinds from planner edges.
        # if module == 'planner-segment-edges':
        #     edges = json.loads(text)['edges']
        #     for edge in edges:
        #         contractor_id = edge['driver_id']
        #         kind = edge['kind']
        #         available_candidates.append(
        #             {
        #                 'contractor_id': contractor_id,
        #                 'kind': kind
        #             }
        #         )
        if (module == 'ServiceApi') and (resp is not None) and (
                uri == '/order-search'):
            # Unlike LogEntrySegmentDetector._parse_candidates_info, a
            # malformed response is not caught here and will raise.
            response = json.loads(resp)
            if 'candidates' not in response:
                return None

            for candidate in response['candidates']:
                candidate_id = candidate['id']
                available_candidates.append(candidate_id)
        return available_candidates


    # Project-local import, done here so that importing this module does not
    # require the project package.
    from projects.efficiency_metrics.project_config import get_project_cluster

    cluster = get_project_cluster()

    # The current time is appended to the job name.
    # NOTE(review): presumably to keep job names unique between runs -- confirm.
    job = cluster.job('Couriers collection' + str(time.time()))
    # NOTE(review): the job is configured here but nothing in the visible
    # code adds operations to it or runs it -- confirm whether the rest is
    # missing or lives elsewhere.
    job = job.env(
        bytes_decode_mode='strict',
        yt_spec_defaults={'max_failed_job_count': 1000}
    )



