-- Session setup: target cluster, YT scheduling pragmas and query parameters.
USE hahn;

PRAGMA yt.PoolTrees = 'physical';
PRAGMA yt.TentativePoolTrees = 'cloud';

-- Parameters supplied by the caller.
DECLARE $start_date AS String;             -- lower bound passed to every RANGE() scan
DECLARE $end_date AS String;               -- upper bound passed to every RANGE() scan
DECLARE $yt_pool AS String;                -- value assigned to yt.Pool below
DECLARE $squeeze_path AS String;           -- parent of the "/similar" and "/spylog" inputs
DECLARE $dst_path AS String?;              -- optional; NULL selects the append-to-mapping branch
DECLARE $query_to_cluster_path AS String;  -- table of already known query -> cluster rows

PRAGMA yt.Pool = $yt_pool;

-- Model attached under the alias "model".
-- To locate the resource, look for query_frankenstein_clustering_optimized_model.dssm in
-- https://a.yandex-team.ru/arc/trunk/arcadia/search/wizard/data/wizard/QueryFrankensteinClustering/ya.make#L9
PRAGMA File("model", "https://proxy.sandbox.yandex-team.ru/1941102488");

-- Attached YQL libraries and the symbols this script takes from them.
PRAGMA library("common.sql");
PRAGMA library("define_cluster_lib.sql");

IMPORT common SYMBOLS $normalize_query;
IMPORT define_cluster_lib SYMBOLS $get_cluster_id, $normalize_dopp;

-- Input directories located under the squeeze root.
$similar_path = $squeeze_path || "/similar";
$spylog_path = $squeeze_path || "/spylog";

-- Maps a geo id to its "country"-level region via Geo::RoundRegionById.
-- The id is cast to int32 before the lookup; callers read .id and .en_name from the result.
$region_by_id = ($id) -> (
    Geo::RoundRegionById(CAST($id AS int32), "country")
);

-- Candidate queries gathered from three sources, all shaped as (query, user_region, Qdopp).
-- UNION ALL is used, so duplicates across and within sources are kept at this stage.
$raw_queries = (
    -- Source 1: "similar" squeeze; norm_query is used as is.
    SELECT
        norm_query AS query,
        CAST(user_region AS int32) AS user_region,
        $normalize_dopp(norm_query) AS Qdopp
    FROM RANGE($similar_path, $start_date, $end_date)

    UNION ALL

    -- Source 2: "spylog" squeeze; same projection as source 1.
    SELECT
        norm_query AS query,
        CAST(user_region AS int32) AS user_region,
        $normalize_dopp(norm_query) AS Qdopp
    FROM RANGE($spylog_path, $start_date, $end_date)

    UNION ALL

    -- Source 3: daily request cube; the query is passed through $normalize_query first,
    -- and the region is the id of the country-level region of geo_id.
    SELECT
        $normalize_query(normal_query) AS query,
        $region_by_id(geo_id).id AS user_region,
        $normalize_dopp($normalize_query(normal_query)) AS Qdopp
    FROM RANGE("//statbox/cube/daily/request/v4", $start_date, $end_date)
    WHERE
        is_good_search
        AND NOT is_robot
        AND $region_by_id(geo_id).en_name == "Russia"
);

-- Queries that do not yet have a row in $query_to_cluster_path.
-- LEFT ONLY JOIN keeps only left rows with no match on `query`;
-- ANY on the left side limits it to one row per join key.
$queries = (
    SELECT
        q.*
    FROM ANY $raw_queries AS q
    LEFT ONLY JOIN $query_to_cluster_path AS c
    USING (query)
    WHERE q.query IS NOT NULL
);

-- Attach Ext_Qfuf_Top10 to every new query by Qdopp and store the result in a
-- temporary table sorted by query. Queries without a match keep a NULL Ext_Qfuf_Top10
-- (LEFT JOIN); ANY on the right side prevents row multiplication.
INSERT INTO @queries_with_qfuf_top_10
SELECT
    q.*,
    top10.Ext_Qfuf_Top10 AS Ext_Qfuf_Top10
FROM $queries AS q
LEFT JOIN ANY `//home/searchshare/common/source_queries_exp_top10` AS top10
USING (Qdopp)
ORDER BY query;

-- Finish the temporary table before the statements that read it.
COMMIT;

-- Job settings for the statements below, which call $get_cluster_id.
PRAGMA yt.DefaultMemoryLimit = "5G";
PRAGMA yt.DataSizePerJob = '32M';

-- Compute a cluster id per query and write it out; the destination depends on $dst_path.
EVALUATE IF $dst_path IS NULL DO BEGIN
    -- No explicit destination: append the new rows to the query -> cluster table.
    -- ClusterID is cast to int64? and unwrapped, so a NULL cluster id fails the query.
    INSERT INTO $query_to_cluster_path
    SELECT
        query,
        Unwrap(CAST($get_cluster_id(query, Ext_Qfuf_Top10, user_region) AS int64?)) AS ClusterID
    FROM @queries_with_qfuf_top_10 AS src
    ORDER BY query;
END DO
ELSE DO BEGIN
    -- Explicit destination: overwrite it; ClusterID is stored exactly as returned
    -- by $get_cluster_id (no cast, no unwrap).
    INSERT INTO $dst_path WITH TRUNCATE
    SELECT
        query,
        $get_cluster_id(query, Ext_Qfuf_Top10, user_region) AS ClusterID
    FROM @queries_with_qfuf_top_10 AS src
    ORDER BY query;
END DO;
