# -* encoding: utf8 -*-
from common import *

import sys
import json
import time
import argparse


# Lookup tables shared by main() and parse_periods().
# main() fills them from the distinct record_ts values it reads, taken in
# ascending order: ts_to_num maps a heartbeat timestamp to its position in
# that order, and num_to_ts is the inverse mapping (position -> timestamp).
ts_to_num = {}
num_to_ts = {}

def get_key(rec):
    """Return the grouping key of a heartbeat record.

    The key is the tuple ``(url, promo_type, promo_position)`` read from
    ``rec`` by item access; a missing field propagates the lookup error.
    """
    url = rec["url"]
    promo_type = rec["promo_type"]
    promo_position = rec["promo_position"]
    return url, promo_type, promo_position

def get_time_tuple(rec):
    """Return the timing fields of a heartbeat record.

    The result is the tuple ``(record_ts, tm_start, tm_end)`` read from
    ``rec`` by item access; a missing field propagates the lookup error.
    """
    record_ts = rec["record_ts"]
    tm_start = rec["tm_start"]
    tm_end = rec["tm_end"]
    return record_ts, tm_start, tm_end

def parse_periods(doc_beats, cnt_beats_total):
    """Collapse the heartbeats of one key into contiguous promotion periods.

    Beats are ordered by ``record_ts`` and split into runs whose global beat
    numbers (looked up in the module-level ``ts_to_num``) are consecutive.
    Every run produces one ``(start, end)`` pair:

    * ``start`` is the later of the first beat's ``record_ts`` and its
      ``tm_start``;
    * ``end`` is the earlier of the next global beat's timestamp and the last
      beat's ``tm_end`` when a later global beat exists, otherwise the last
      beat's own ``record_ts`` (the promotion is considered still running).

    Args:
        doc_beats: list of ``(record_ts, tm_start, tm_end)`` tuples belonging
            to a single key. The list is sorted in place by ``record_ts``.
        cnt_beats_total: number of the last global beat, i.e. the largest
            value stored in ``ts_to_num`` (despite the name this is an index,
            not a count).

    Returns:
        List of ``(start, end)`` tuples in ascending beat order; an empty
        list when ``doc_beats`` is empty.

    Raises:
        KeyError: if a beat's ``record_ts`` is absent from ``ts_to_num``.
    """
    doc_beats.sort(key=lambda x: x[0])
    if not doc_beats:
        return []

    # Split the sorted beats into runs of consecutive global beat numbers.
    # Each run is stored as inclusive (first_index, last_index) into doc_beats.
    runs = []
    run_start = 0
    prev_id = ts_to_num[doc_beats[0][0]]
    # Enumerate from 1 so that ``i`` is the beat's real index in doc_beats;
    # counting from 0 here would shift every run boundary one beat back.
    for i, beat in enumerate(doc_beats[1:], 1):
        beat_id = ts_to_num[beat[0]]
        if beat_id != prev_id + 1:
            runs.append((run_start, i - 1))
            run_start = i
        prev_id = beat_id
    runs.append((run_start, len(doc_beats) - 1))

    periods = []
    for first_i, last_i in runs:
        first_beat = doc_beats[first_i]
        last_beat = doc_beats[last_i]

        start = max(first_beat[0], first_beat[1])

        last_beat_ts = last_beat[0]
        end_time = last_beat[2]
        last_beat_id = ts_to_num[last_beat_ts]
        if last_beat_id != cnt_beats_total:
            # Later beats exist overall, so the promotion period has ended:
            # it cannot outlast the next global beat nor its declared end.
            end = min(num_to_ts[last_beat_id + 1], end_time)
        else:
            # This is the very last beat of all; the promotion is still
            # running, so close the period at the beat itself.
            end = last_beat_ts

        periods.append((start, end))

    return periods

def main():
    """Aggregate raw promo heartbeats into promotion periods.

    Reads the table named by ``--raw_heartbeats``, groups its records by
    ``get_key`` and numbers every distinct ``record_ts`` in ascending order
    (filling the module-level ``ts_to_num`` / ``num_to_ts`` tables). Each
    group is then turned into periods by ``parse_periods`` and one record per
    period is written to the table named by ``--aggregated_logs``.
    """
    print(sys.argv)

    arg_parser = argparse.ArgumentParser()
    for option in ('--raw_heartbeats', '--aggregated_logs'):
        arg_parser.add_argument(option, type=str, required=True)
    args = arg_parser.parse_args()

    yt_spec = dict(pool_trees=["physical"], tentative_pool_trees=["cloud"])
    templates = dict(tmp_root='//tmp', title='BloggerNewContentRecommendations')
    cluster = clusters.yt.Hahn().env(
        parallel_operations_limit=10,
        yt_spec_defaults=yt_spec,
        templates=templates
    )

    # Single pass over the input: collect every distinct heartbeat timestamp
    # and bucket the timing tuples by key.
    seen_ts = set()
    beats_by_key = {}
    for row in cluster.driver.read(args.raw_heartbeats):
        seen_ts.add(row["record_ts"])

        row_key = get_key(row)
        row_times = get_time_tuple(row)
        beats_by_key.setdefault(row_key, []).append(row_times)

    # Number the distinct timestamps in ascending order, in both directions.
    for number, ts in enumerate(sorted(seen_ts)):
        ts_to_num[ts] = number
        num_to_ts[number] = ts
    # parse_periods expects the number of the last beat, not the beat count.
    last_beat_number = len(seen_ts) - 1

    records = []
    for (url, promo_type, promo_position), doc_beats in beats_by_key.items():
        for tm_start, tm_end in parse_periods(doc_beats, last_beat_number):
            records.append(Record(
                url=url,
                promo_type=promo_type,
                promo_position=promo_position,
                tm_start=tm_start,
                tm_end=tm_end
            ))

    print(records)
    cluster.driver.write(args.aggregated_logs, records)

# Run the aggregation only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

