import sys
import json
import itertools
import re
from datetime import datetime, timedelta


# Matches a timestamp that starts with 'YYYY-MM-DD HH:MM:SS' (anchored at the
# start only, because it is used with ``.match``).
iso_eventtime_regexp = re.compile(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}')

# Log message of the form 'File with hash <file_id> already processed to
# <target_type>...'; exposes the named groups ``file_id`` and ``target_type``.
file_with_hash_regexp = re.compile(r'File with hash (?P<file_id>[^ ]+) already processed to (?P<target_type>[^ ]+).*')

# Log message of the form "onConvertBegin('<file_id>', <target_type>...";
# exposes the same two named groups as the pattern above.
on_convert_begin_regexp = re.compile(r'onConvertBegin\(\'(?P<file_id>[^\']+)\', (?P<target_type>[^\)]+).*')

# safe_print() drops a record when the string form of any of its values is
# longer than this many characters.
MAX_FIELD_LENGTH = 1000

# Weight threshold: only weights strictly greater than this shift the last
# access date. Overridden from the second command-line argument in main().
_min_weight = 0

# Number of days added to the last access date per unit of weight.
# Overridden from the third command-line argument in main().
_weight_multiplier = 0


def safe_print(data):
    """Emit ``data`` on stdout as one JSON line, but only if it is usable.

    Nothing is printed when ``data`` is empty or ``None``, or when at least
    one of its values is falsy or has a string form longer than
    ``MAX_FIELD_LENGTH`` characters.
    """
    if not data:
        return
    has_bad_field = any(
        not value or len(str(value)) > MAX_FIELD_LENGTH
        for value in data.values()
    )
    if has_bad_field:
        return
    print(json.dumps(data))


def do_decode(lines):
    """Lazily parse an iterable of JSON-encoded lines.

    Yields one decoded object per input line, in input order. Decoding
    errors from ``json.loads`` propagate to the consumer.
    """
    yield from map(json.loads, lines)


def get_data_from_events_log(row):
    """Convert an events-log row into an output record.

    Returns ``None`` when the row lacks any of the ``action``, ``file_id`` or
    ``target_type`` keys, or when the action is not one of the handled ones.
    A ``DOCUMENT_DELETED`` action yields a deletion marker; ``DOCUMENT_SKIPPED``
    and ``DOCUMENT_ACCESSED`` yield a last-access record (or ``None`` if one
    cannot be built from the row).
    """
    fields = row.keys()
    if not all(name in fields for name in ('action', 'file_id', 'target_type')):
        return None

    # The composite id is built before the action is inspected, exactly as
    # every handled action needs it.
    document_id = get_file_id(row.get('file_id'), row.get('target_type'))
    action = row.get('action')

    if action == 'DOCUMENT_DELETED':
        return {'file_id': document_id, 'deleted': True}
    if action in ('DOCUMENT_SKIPPED', 'DOCUMENT_ACCESSED'):
        return get_last_access_updated_document_data(row, document_id)
    return None


def get_last_access_updated_document_data(row, file_id):
    """Build a last-access record for ``file_id`` from an events-log row.

    Args:
        row: Decoded log row. ``last_access_date`` is expected in
            ``'%Y-%m-%d'`` form; ``weight`` is optional.
        file_id: Identifier to put into the resulting record.

    Returns:
        ``{'file_id': ..., 'last_access_day': 'YYYY-MM-DD'}``, or ``None``
        when ``last_access_date`` is missing, empty or cannot be parsed.

    When the row carries a weight greater than ``_min_weight``, the date is
    moved forward by ``round(weight * _weight_multiplier)`` days. A weight
    that is missing, null or not convertible to ``int`` is ignored rather
    than aborting the whole job, mirroring how a malformed date is tolerated.
    """
    raw_date = row.get('last_access_date')
    if not raw_date:
        return None
    try:
        last_access_date = datetime.strptime(raw_date, '%Y-%m-%d')
    except (TypeError, ValueError):
        # Non-string value or a string in an unexpected format.
        return None

    try:
        weight = int(row.get('weight'))
    except (TypeError, ValueError, OverflowError):
        # Absent (None), non-numeric or infinite weight: apply no shift.
        weight = None

    if weight and weight > _min_weight:
        shift_days = int(round(weight * _weight_multiplier))
        last_access_date = last_access_date + timedelta(days=shift_days)
    return {'file_id': file_id, 'last_access_day': last_access_date.strftime('%Y-%m-%d')}


def get_date(date_str):
    """Return the 'YYYY-MM-DD' part of an ISO-like event timestamp.

    The date example is '2020-05-13 19:33:29'.

    Returns ``None`` when ``date_str`` is not a string (for instance a JSON
    null) or does not start with a timestamp in the expected format.
    """
    # Guard first: matching a compiled str pattern against a non-string
    # raises TypeError instead of simply failing to match.
    if not isinstance(date_str, str) or not iso_eventtime_regexp.match(date_str):
        return None
    return date_str[:10]


def get_data_from_regular_tskv_log(row):
    """Extract a last-access record from a regular web-log row.

    The row's ``message`` must match one of the two known message patterns
    (``file_with_hash_regexp`` or ``on_convert_begin_regexp``); the document
    id is built from the captured ``file_id`` and ``target_type`` groups and
    the access day is taken from ``iso_eventtime``.

    Returns:
        ``{'file_id': ..., 'last_access_day': ...}``, or ``None`` when the
        row lacks ``message`` or ``iso_eventtime``, the message is not a
        string, the message matches neither pattern, or the timestamp is not
        in the expected format.
    """
    message = row.get('message')
    # A missing key yields None here; a present but non-string value (e.g.
    # JSON null) would make the regex match raise, so both are skipped.
    if not isinstance(message, str) or 'iso_eventtime' not in row:
        return None

    match = file_with_hash_regexp.match(message) or on_convert_begin_regexp.match(message)
    if not match:
        return None

    last_access_day = get_date(row.get('iso_eventtime'))
    if not last_access_day:
        return None
    return {
        'file_id': get_file_id(match.group('file_id'), match.group('target_type')),
        'last_access_day': last_access_day,
    }


def get_file_id(file_id, target_type):
    """Combine a file id and a target type into one space-separated key."""
    return f'{file_id} {target_type}'


def do_map(rows):
    """Map stage: turn decoded input rows into output records on stdout.

    Three kinds of rows are handled; everything else is ignored:

    * rows without ``tskv_format`` that already carry ``file_id`` and
      ``last_access_day`` are re-emitted with just those two fields;
    * ``ydisk-docviewer-web-events-log`` rows go through
      ``get_data_from_events_log``;
    * ``ydisk-web-log`` rows go through ``get_data_from_regular_tskv_log``.
    """
    for row in rows:
        fields = row.keys()
        already_mapped = (
            'tskv_format' not in fields
            and 'file_id' in fields
            and 'last_access_day' in fields
        )
        if already_mapped:
            record = {'file_id': row.get('file_id'), 'last_access_day': row.get('last_access_day')}
        else:
            log_format = row.get('tskv_format')
            if log_format == 'ydisk-docviewer-web-events-log':
                record = get_data_from_events_log(row)
            elif log_format == 'ydisk-web-log':
                record = get_data_from_regular_tskv_log(row)
            else:
                # Unknown or absent format: nothing to emit for this row.
                continue
        safe_print(record)


def do_reduce(rows):
    """Reduce stage: emit one record per document with its latest access day.

    Consecutive rows sharing the same ``file_id`` are merged, so the input is
    expected to arrive grouped by ``file_id``. For each group the greatest
    non-empty ``last_access_day`` (compared as strings) is kept. A group
    containing a row with a truthy ``deleted`` value produces no output.

    Rows without a usable ``file_id`` are ignored. Accumulator state is kept
    per group, so such rows cannot leak their access day or deletion flag
    into the document that follows them.
    """
    for file_id, group in itertools.groupby(rows, key=lambda row: row.get('file_id')):
        last_access_day = ''
        deleted = False
        for row in group:
            candidate = row.get('last_access_day')
            if candidate and candidate > last_access_day:
                last_access_day = candidate
            if row.get('deleted'):
                deleted = True
        if file_id and not deleted:
            safe_print({'file_id': file_id, 'last_access_day': last_access_day})


def main():
    """Command-line entry point.

    Usage: ``<script> map|reduce [min_weight weight_multiplier]``.

    The first argument selects the stage to run over JSON lines read from
    stdin. When at least three arguments are given, the second and third
    override the module-level ``_min_weight`` (int) and
    ``_weight_multiplier`` (float). An unknown or missing stage prints an
    error to stderr and exits with status 1.
    """
    global _min_weight, _weight_multiplier

    handlers = {'map': do_map, 'reduce': do_reduce}
    argv = sys.argv
    if len(argv) < 2 or argv[1] not in handlers:
        print('Please, specify proper map or reduce command to execute', file=sys.stderr)
        sys.exit(1)

    if len(argv) >= 4:
        _min_weight = int(argv[2])
        _weight_multiplier = float(argv[3])

    run_stage = handlers[argv[1]]
    run_stage(do_decode(sys.stdin))


# Run the job only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
