-- https://a.yandex-team.ru/arcadia/products/analytics/charts/category_coverage

-- Session setup: target cluster and YQL/YT execution settings for the whole script.
USE hahn;
-- YT pool setting for this script's operations (see YQL docs for yt.StaticPool).
PRAGMA yt.StaticPool = "goods_quality";
-- Enables schema inference for input tables (see YQL docs for yt.InferSchema);
-- presumably needed because the cache tables carry no strict schema -- TODO confirm.
PRAGMA yt.InferSchema = '1';
-- Join results expose columns without the table-alias prefix; the final
-- select relies on this when it refers to the bare `date` column.
PRAGMA SimpleColumns;

-- First calendar day of the generated date range (passed to the UDF as date_start).
$date_start = '2022-05-01';

-- Parameter toggle: exactly one of the two blocks below is meant to be active.
-- This first block is wrapped in a block comment, so the templated values
-- are currently DISABLED. The placeholders are presumably substituted by an
-- external launcher before execution -- TODO confirm.
/*
$date = '${global.date}';
$outputdir = '${global.outputdir_eval}';
--*/

-- The opening marker of this second block is itself line-commented, so the
-- hard-coded values below are currently ACTIVE.
-- NOTE(review): this looks like a local/debug configuration; confirm it is
-- intended to be the committed state.
--/*
$date = '2022-07-12';
$outputdir = '//home/goods_quality/charts/category_coverage/eval';
--*/

/* ======================================================================== */

-- Paths derived from $outputdir.
-- Directory whose tables are read as the coverage input.
$cachedir = $outputdir || '/cache';
-- Output table: one row per calendar day joined with coverage data (unsorted).
$output_merged = $outputdir || '/category_coverage_merged';
-- Output table: same rows plus a `timestamp` column, sorted by that column.
$output_merged_sorted = $outputdir || '/category_coverage_merged_sorted';

/* ======================================================================== */

$script = @@#py
import datetime

# Placeholder emitted as date_last_release for days that precede every known
# release. It is deliberately not a real calendar date, so an equality join
# against real release dates never matches it.
NO_RELEASE_DATE = '2022-00-00'


def get_dates_between(date_start, date_end, date_delta=1):
    """Return the 'YYYY-MM-DD' dates from date_start to date_end, inclusive.

    date_start, date_end: ISO-format date strings.
    date_delta: step in days between consecutive returned dates.

    Returns an empty list when date_end is earlier than date_start.
    """
    start = datetime.datetime.fromisoformat(date_start)
    end = datetime.datetime.fromisoformat(date_end)
    dates = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days + 1, date_delta)]
    return [d.strftime('%Y-%m-%d') for d in dates]


def udf_get_dates_between(rows):
    """For every input row, yield one record per calendar day in its range.

    Each row exposes date_start, date_end (bytes) and release_dates (an
    iterable of bytes, or None). Every yielded dict has:
      "date"              - the calendar day, 'YYYY-MM-DD';
      "date_last_release" - the latest release date that is <= "date", or
                            NO_RELEASE_DATE when there is none.

    Release dates are assumed to be 'YYYY-MM-DD' strings, so lexicographic
    order equals chronological order.
    """
    for row in rows:
        date_start = row.date_start.decode('ascii')
        date_end = row.date_end.decode('ascii')
        # release_dates is optional: treat a missing value as "no releases".
        # Deduplicate so repeated dates cannot stall the walk below.
        release_dates = sorted({s.decode('ascii') for s in (row.release_dates or [])})

        last_release = NO_RELEASE_DATE
        next_i = 0
        for date in get_dates_between(date_start, date_end):
            # Consume every release that is not later than the current day.
            # Using <= (rather than ==) also picks up releases that happened
            # before date_start instead of getting stuck on them forever.
            while next_i < len(release_dates) and release_dates[next_i] <= date:
                last_release = release_dates[next_i]
                next_i += 1
            yield {
                "date": date,
                "date_last_release": last_release
            }
@@;

-- Binds the Python function udf_get_dates_between from $script as a YQL callable.
-- Input:  a list of rows, each carrying the date range and the known release dates.
-- Output: a stream of (date, date_last_release) rows.
-- String fields arrive in Python as bytes, which is why the script decodes them.
-- NOTE(review): release_dates is declared as an optional Stream<String>, while the
-- caller builds it with a list aggregate in a scalar subquery -- confirm the
-- declared type is the intended one.
$udf_get_dates_between = Python3::udf_get_dates_between(
Callable<
    (List<Struct<
        date_start:String,
        date_end:String,
        release_dates:Stream<String>?
    >>)->
    Stream<Struct<
        `date`:String,
        `date_last_release`:String
    >>
>, $script);

/* ======================================================================== */

-- All rows of every table found under the cache directory.
$coverage_by_release = select * from range($cachedir);

-- Distinct release dates present in the cache. A plain aggregate_list would
-- emit one entry per input row, i.e. repeated dates whenever a release has
-- more than one row; the UDF only needs each date once.
$release_dates = select aggregate_list_distinct(`date`) from $coverage_by_release;

-- Single-row input for the UDF: the calendar range plus the release dates.
$dates_start_end = select
    $date_start as date_start,
    $date as date_end,
    $release_dates as release_dates;

-- One row per calendar day in [date_start, date_end], tagged with date_last_release.
$dates = process $dates_start_end using $udf_get_dates_between(TableRows());

-- Attach to every calendar day the coverage rows of its last release.
-- Left join: days whose date_last_release matches no coverage row are kept,
-- with the coverage columns left empty. The coverage-side `date` is dropped
-- so that the calendar `date` is the only column with that name.
$dates_joined_coverage = (
    select
        cal.*,
        cov.* without cov.`date`
    from $dates as cal
    left join $coverage_by_release as cov
        on cal.date_last_release == cov.`date`
);

-- Output 1: the joined rows as they are, replacing the table's previous contents.
insert into $output_merged with truncate
select *
from $dates_joined_coverage;

-- Output 2: the same rows plus an end-of-day `timestamp` string built from
-- `date`, written in ascending `timestamp` order.
insert into $output_merged_sorted with truncate
select
    merged.*,
    `date` || 'T23:59:59' as `timestamp`
from $dates_joined_coverage as merged
order by `timestamp`;

