use hahn;

-- Largest table name found in the structured scheduler log folder.
-- presumably table names are dates, so the maximum is the newest day — TODO confirm.
$table_name = (
    select max(TableName(Path, 'yt')) as latest_log_table
    from FOLDER(`//logs/yt-structured-scheduler-log/1d`)
);

-- Largest table name already present in the report output folder.
-- The `evaluate if` at the bottom of the script treats NULL here as
-- "no report has been built yet".
$last_report_table_name = (
    select max(TableName(Path, 'yt')) as latest_report_table
    from FOLDER(`//home/webmaster/prod/analytics/consumption_yt`)
);

-- All three paths reuse the same table name: operation-level log (input),
-- job-level event log (input) and the report table (output).
-- NOTE(review): $table_name is looked up in the structured scheduler log only;
-- this assumes the scheduling event log has a table with the same name.
$source_table_path =
    '//logs/yt-structured-scheduler-log/1d/' || $table_name;
$job_info_source_table_path =
    '//logs/yt-scheduling-event-log/1d/' || $table_name;
$output_table_path =
    '//home/webmaster/prod/analytics/consumption_yt/' || $table_name;

-- Converts a byte count to gigabytes using binary units
-- (1 GB = 1024 * 1024 * 1024 bytes). The argument is cast to Double first,
-- so the result is a floating-point value.
$byte_to_gb = ($bytes) -> {
    -- Dividing once by 1024^3 equals dividing by 1024 three times:
    -- every divisor is a power of two, so no rounding is introduced.
    $bytes_per_gb = 1024.0 * 1024.0 * 1024.0;
    return cast($bytes as Double) / $bytes_per_gb;
};

-- Sums one statistic over all payloads of the `$output` dict.
--   $output   - Yson dict; each payload is expected to hold, under $var_name,
--               a list of entries carrying a ['summary']['sum'] value.
--               presumably keyed by output table index — TODO confirm.
--   $var_name - name of the statistic to sum (e.g. 'regular_disk_space').
-- Returns the Int64 total of every entry's ['summary']['sum'].
$parse_output_in_data_statistics = ($output, $var_name) -> {
    $payloads = DictPayloads(Yson::ConvertToDict($output));

    -- One list of entries per payload, flattened into a single list.
    $entries = ListFlatten(ListMap(
        $payloads,
        ($payload) -> {
            return Yson::ConvertToList($payload[$var_name]);
        }
    ));

    $entry_sums = ListMap(
        $entries,
        ($entry) -> {
            return Yson::ConvertToInt64($entry['summary']['sum']);
        }
    );

    return ListSum($entry_sums);
};


-- Sums the ['sum'] field of one statistic over all payloads of `$output`.
--   $output   - Yson dict whose payloads each hold $var_name as a node
--               with a 'sum' child.
--   $var_name - name of the statistic to sum (called with 'chunk_count').
-- Returns the Int64 total across payloads.
$parse_chunk_count = ($output, $var_name) -> {
    $payloads = DictPayloads(Yson::ConvertToDict($output));
    $payload_sums = ListMap(
        $payloads,
        ($payload) -> {
            return Yson::ConvertToInt64($payload[$var_name]['sum']);
        }
    );
    return ListSum($payload_sums);
};

-- Total 'erasure_disk_space' reported in `$output`, converted from bytes
-- to gigabytes (1024^3 bytes).
$get_erasure_disk_space_usage = ($output) -> {
    $erasure_bytes = $parse_output_in_data_statistics($output, 'erasure_disk_space');
    return $byte_to_gb($erasure_bytes);
};

-- Total 'regular_disk_space' reported in `$output`, converted from bytes
-- to gigabytes (1024^3 bytes).
$get_regular_disk_space_usage = ($output) -> {
    $regular_bytes = $parse_output_in_data_statistics($output, 'regular_disk_space');
    return $byte_to_gb($regular_bytes);
};

-- Maps every element of the Yson list `$paths` to the first non-null of:
--   1. its ['$attributes']['original_path'],
--   2. its ['$value'],
--   3. the element itself.
-- Returns the resulting list of Yson nodes.
$get_input_tables = ($paths) -> {
    $resolve_path = ($path_node) -> {
        return coalesce(
            $path_node['$attributes']['original_path'],
            $path_node['$value'],
            $path_node
        );
    };
    return ListMap(Yson::ConvertToList($paths), $resolve_path);
};

-- Parses a string in the '%Y-%m-%dT%H:%M:%SZ' layout into a Datetime.
-- NOTE(review): the format has no fractional-seconds part — confirm that the
-- log timestamps passed in never carry one.
$datetime_parser = ($x) -> {
    $parse_timestamp = DateTime::Parse('%Y-%m-%dT%H:%M:%SZ');
    return DateTime::MakeDatetime($parse_timestamp($x));
};

-- Operation-level rows of the scheduler log: only 'operation_completed'
-- events whose authenticated user is 'robot-webmaster'.
$part_data = (
    select *
    from $source_table_path
    where event_type = 'operation_completed'
        and authenticated_user = 'robot-webmaster'
);

-- Per-job statistics from the scheduling event log, restricted to
-- 'job_completed' events of operations present in $part_data.
-- The semi join keeps only columns of the job log (jd); $part_data acts
-- purely as a filter on operation_id.
$tmp_job_data = (
    select
        jd.operation_id as operation_id,
        jd.statistics['data']['input']['chunk_count']['sum'] as input_chunk_count,
        jd.statistics['data']['input']['erasure_disk_space']['sum'] as input_erasure_disk_space,
        $parse_chunk_count(jd.statistics['data']['output'], 'chunk_count') as output_chunk_count,
        jd.node_address as node_address
    from $job_info_source_table_path as jd
    left semi join $part_data as pd
        on jd.operation_id = pd.operation_id
    where jd.event_type = 'job_completed'
);

-- Job statistics rolled up to one row per operation:
--   input_chunks             - total input chunk count over the operation's jobs
--   input_erasure_disk_space - total input erasure disk space (raw value, not yet converted to GB)
--   output_chunks            - total output chunk count over the operation's jobs
--   nodes                    - number of distinct node addresses the jobs ran on
$job_data = (
    select
        operation_id,
        sum(Yson::ConvertToInt64(input_chunk_count)) as input_chunks,
        sum(Yson::ConvertToInt64(input_erasure_disk_space)) as input_erasure_disk_space,
        sum(output_chunk_count) as output_chunks,
        count(distinct node_address) as nodes
    from $tmp_job_data
    group by operation_id
);

-- Describes how an operation was started.
--   $spec_start_cmd - Yson list of strings (the start command and its arguments)
--   $spec_yql_op    - optional string; takes precedence when it is not NULL
-- Returns $spec_yql_op if present, otherwise the command parts joined by a space.
$started_by_command_extract = ($spec_start_cmd, $spec_yql_op) -> {
    $command_line = String::JoinFromList(Yson::ConvertToStringList($spec_start_cmd), ' ');
    return coalesce($spec_yql_op, $command_line);
};

-- Writes the per-operation consumption report to $output_table_path,
-- replacing any existing content of that table (WITH TRUNCATE).
-- Each row is one completed operation from $part_data joined with its
-- job-level aggregates from $job_data. The inner join drops operations
-- that have no row in $job_data.
-- Rows are ordered by cluster, then by duration from longest to shortest.
define action $make_yt_operations_report() as
    insert into $output_table_path with truncate
    select
        pd.cluster as cluster,
        pd.operation_type as operation_type,
        pd.operation_id as operation_id,
        pd.start_time as start_time,
        $get_input_tables(pd.spec['input_table_paths']) as input_tables,
        -- Output disk usage in GB (1024^3 bytes), taken from job_statistics_v2.
        $get_erasure_disk_space_usage(pd.progress['job_statistics_v2']['data']['output']) as output_erasure_disk_space_usage_gb,
        $get_regular_disk_space_usage(pd.progress['job_statistics_v2']['data']['output']) as output_regular_disk_space_usage_gb,
        -- Duration in seconds between the operation's start and finish.
        -- The module is spelled DateTime:: here to match $datetime_parser.
        DateTime::ToSeconds($datetime_parser(pd.finish_time))
            - DateTime::ToSeconds($datetime_parser(pd.start_time)) as duration_s,
        Yson::ConvertToDouble(pd.accumulated_resource_usage_per_tree['physical']['cpu']) as cpu,
        $byte_to_gb(Yson::ConvertToDouble(pd.accumulated_resource_usage_per_tree['physical']['user_memory'])) as ram_gb,
        Yson::ConvertToInt64(pd.progress['total_job_counter']['total']) as job_count,
        -- The YQL operation url, when present, is preferred over the raw start command.
        $started_by_command_extract(
            pd.spec['started_by']['command'],
            Yson::ConvertToString(pd.spec['description']['yql_op_url']['$value'])
        ) as start_command,
        Yson::ConvertToInt64(pd.spec['started_by']['pid']) as pid,
        jd.input_chunks as input_chunks,
        jd.output_chunks as output_chunks,
        -- Converted to GB by $byte_to_gb; the column name carries no unit suffix.
        $byte_to_gb(jd.input_erasure_disk_space) as input_erasure_disk_space,
        jd.nodes as nodes
    from $part_data as pd
    inner join $job_data as jd
        on pd.operation_id = jd.operation_id
    order by
        cluster,
        duration_s desc
end define;

-- Build the report only when no report table exists yet, or when the newest
-- log table name is greater than the newest report table name.
evaluate if $last_report_table_name is null or $table_name > $last_report_table_name
    do $make_yt_operations_report();