-- Default cluster for the table paths used in this query.
use hahn;

-- Scheduler pool trees for the YT operations started by this query.
pragma yt.PoolTrees = 'physical';
pragma yt.TentativePoolTrees = 'cloud';

-- Query parameters; the ones typed `String?` are optional.
-- $input_path: table the queries are read from.
declare $input_path as String;
-- $output_path: table the result is written to (its contents are replaced).
declare $output_path as String;
-- $yt_pool: YT pool to run in; falls back to "default" below.
declare $yt_pool as String?;
-- $tmp_folder: folder for temporary tables; falls back to "//tmp" below.
declare $tmp_folder as String?;
-- $query_column: column name handed to $get_query together with the row.
declare $query_column as String;
-- $region_column: column name handed to $get_region together with the row.
declare $region_column as String;
-- $new_query_to_cluster_folder: when not null, the rows flagged as new
-- clusters are additionally written under this folder (see the end of the script).
declare $new_query_to_cluster_folder as String?;

-- Apply defaults for the optional parameters before they are used in pragmas.
$yt_pool = $yt_pool ?? "default";
pragma yt.Pool = $yt_pool;
$tmp_folder = $tmp_folder ?? "//tmp";
pragma yt.TmpFolder = $tmp_folder;

-- Look for query_frankenstein_clustering_optimized_model.dssm in archive https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/QueryFrankensteinClustering/ya.make#L9
-- Attaches the file at this URL to the query under the alias "model".
-- NOTE(review): nothing in this script reads "model" directly; presumably
-- define_cluster_lib.sql does — confirm.
pragma File("model", "https://proxy.sandbox.yandex-team.ru/1941102488");

-- Library files attached to the query. Only define_cluster_lib is imported
-- here; common.sql is presumably needed by it — TODO confirm.
pragma library("common.sql");
pragma library("define_cluster_lib.sql");

-- Helpers used below; their implementations live in define_cluster_lib.sql.
import define_cluster_lib symbols $get_cluster_id, $get_query, $get_region, $normalize_dopp, $get_new_query_path;

-- One struct column holding the input row without its `query` member, plus a
-- `query` column produced by $get_query(row, $query_column).
-- ForceRemoveMembers tolerates rows that have no `query` member, so the input
-- table is not required to contain one.
$rows_with_extracted_query = (
    select
        ForceRemoveMembers(TableRow(), ["query"]),
        $get_query(TableRow(), $query_column) as query
    from $input_path
);

-- `flatten columns` expands the struct column back into its members, giving
-- the original columns with `query` replaced by the value computed above.
$raw_queries = (
    select *
    from $rows_with_extracted_query
    flatten columns
);

-- Looks every input query up in the query_to_cluster_classifier table.
-- `any` on the left input is expected to keep a single row per `query` key.
-- Rows with a null `query` are dropped.
-- _is_new_cluster is true when the lookup produced no ClusterID for the query.
$queries = (
    select
        input_row.*,
        known.ClusterID as ClusterID,
        known.ClusterID is null as _is_new_cluster
    from any $raw_queries as input_row
    left join `//home/searchshare/common/query_to_cluster_history/query_to_cluster_classifier` as known
    using (query)
    where input_row.query is not null
);

-- Store the queries in a temporary table together with the Ext_Qfuf_Top10
-- value looked up in source_queries_exp_top10. The lookup key is the query
-- passed through $normalize_dopp; `any` on the right side takes at most one
-- matching row per key, and unmatched queries get a null _Ext_Qfuf_Top10.
insert into @queries_with_qfuf_top_10
select
    q.*,
    top10.Ext_Qfuf_Top10 as _Ext_Qfuf_Top10
from $queries as q
left join any `//home/searchshare/common/source_queries_exp_top10` as top10
on top10.Qdopp = $normalize_dopp(q.query)
order by query;

-- Finish the insert before the statements below read the temporary table.
commit;

-- Job settings for the operations issued after this point.
-- NOTE(review): presumably tuned for the $get_cluster_id computation — confirm.
pragma yt.DefaultMemoryLimit = "5G";
pragma yt.DataSizePerJob = '32M';

-- Final cluster per query: the ClusterID found by the classifier lookup when
-- there is one, otherwise the value returned by $get_cluster_id for the query,
-- its _Ext_Qfuf_Top10 and the region taken from the row via $get_region.
-- `without ClusterID` drops the looked-up column coming from `enriched.*`,
-- leaving the expression below as ClusterID.
$query_with_cluster = (
    select
        enriched.*,
        ClusterID ?? $get_cluster_id(
            query,
            _Ext_Qfuf_Top10,
            $get_region(TableRow(), $region_column)
        ) as ClusterID
    without ClusterID
    from @queries_with_qfuf_top_10 as enriched
);

-- Main result: replace the contents of $output_path with the clustered
-- queries, leaving out the two service columns (those prefixed with `_`).
insert into $output_path with truncate
select *
without
    _is_new_cluster,
    _Ext_Qfuf_Top10
from $query_with_cluster
order by query;

-- Optional export: runs only when $new_query_to_cluster_folder was supplied.
-- Writes the (query, ClusterID) pairs of rows flagged _is_new_cluster to the
-- table path returned by $get_new_query_path for that folder and "search".
evaluate if $new_query_to_cluster_folder is not null do
begin
    $new_clusters_table = $get_new_query_path($new_query_to_cluster_folder, "search");

    insert into $new_clusters_table
    select
        query,
        ClusterID
    from $query_with_cluster
    where _is_new_cluster
    order by query;
end do;
