-- Default cluster for every table path used in this script.
USE hahn;

-- Query parameters supplied by the caller.
-- $yt_pool is passed to the yt.Pool pragma below.
-- $start_date / $end_date bound the RANGE() table scans and the slice of the
-- output table that gets replaced; they are compared against CAST(day AS String),
-- so they are presumably ISO `YYYY-MM-DD` strings -- TODO confirm with the caller.
DECLARE $yt_pool AS String;
DECLARE $start_date AS String;
DECLARE $end_date AS String;

-- NOTE(review): presumably makes a YT operation fail after the first failed job
-- instead of retrying -- confirm against the YT pragma reference.
PRAGMA yt.DefaultMaxJobFails = '1';
-- NOTE(review): presumably turns off automatic merging of temporary tables -- confirm.
PRAGMA yt.TemporaryAutoMerge = 'disabled';
-- Run the YT operations in the pool given by the caller.
PRAGMA yt.Pool = $yt_pool;


-- Destination table. The script reads the rows already stored here (outside the
-- requested date range) and then rewrites the whole table with INSERT ... WITH TRUNCATE.
$output_table = "//home/searchshare/dashboard/yandex_google_datasets/flow_aggregation_raw";


-- Row-level input for the aggregation: rows from the `spylog` and `similar`
-- squeeze directories for tables named between $start_date and $end_date,
-- restricted to rows whose query_list is non-empty.
$source = (
    SELECT
        day,
        q_cluster AS cluster,
        src AS source,
        -- enter_type with all space characters stripped.
        String::RemoveAll(enter_type, " ") AS enter_type,
        ui,
        -- TryMember yields NULL when the input row has no such column,
        -- so tables without C2_prob / cluster can still be read.
        TryMember(TableRow(), "C2_prob", NULL) AS C2_prob,
        CAST(
            TryMember(TableRow(), "cluster", NULL) AS Int32
        ) AS prism_cluster,
        norm_query
    FROM (
        -- The table name is cast to a Date and exposed as `day`.
        SELECT
            spylog_row.*,
            CAST(TableName() AS Date) AS day
        FROM RANGE("//home/searchshare/squeeze/spylog", $start_date, $end_date) AS spylog_row
        WHERE ListLength(query_list) > 0

        UNION ALL

        SELECT
            similar_row.*,
            CAST(TableName() AS Date) AS day
        FROM RANGE("//home/searchshare/squeeze/similar", $start_date, $end_date) AS similar_row
        WHERE ListLength(query_list) > 0
    )
);


-- Per (day, source, ui, enter_type, cluster) counters computed over $source.
$agg_clusters = (
    SELECT
        day,
        source,
        ui,
        enter_type,
        cluster,
        -- Total number of rows in the group.
        COUNT(*) AS queries,
        -- Sum of C2_prob over the group, multiplied by 1.0.
        SUM(C2_prob) * 1.0 AS queries_C2,
        -- Rows whose prism_cluster reaches the 99 / 90 thresholds.
        COUNT_IF(prism_cluster >= 99) AS queries_99,
        COUNT_IF(prism_cluster >= 90) AS queries_90,
        -- Coverage counters: rows where the optional fields are present.
        COUNT_IF(C2_prob IS NOT NULL) AS queries_with_C2_prob,
        COUNT_IF(prism_cluster IS NOT NULL) AS queries_with_prism_cluster
    FROM $source
    GROUP BY
        day,
        source,
        ui,
        enter_type,
        cluster
);


-- Re-bind $agg_clusters to the aggregated rows enriched with cluster names.
-- LEFT JOIN keeps aggregated rows that have no entry in cluster_tree;
-- ANY on the right side takes at most one cluster_tree row per cluster.
$agg_clusters = (
    SELECT *
    FROM $agg_clusters AS r
    LEFT JOIN ANY (
        SELECT
            cluster_id AS cluster,
            cluster_name,
            middle_cluster_name,
            parent_cluster_name
        FROM `//home/searchshare/dashboard/cluster_tree`
    ) AS t
    USING (cluster)
);


-- Rows already stored in the output table whose day, rendered as a string,
-- falls outside [$start_date, $end_date]; these are carried over unchanged.
$old_data = (
    SELECT *
    FROM $output_table
    WHERE NOT (CAST(day AS String) BETWEEN $start_date AND $end_date)
);

-- Rewrite the output table: freshly aggregated rows for the requested range
-- followed by the previously stored rows outside that range.
INSERT INTO $output_table WITH TRUNCATE
SELECT *
FROM $agg_clusters
UNION ALL
SELECT *
FROM $old_data;
