-- YT schema handling: infer table schemas from the data and ignore weak
-- schemas (see the YQL documentation for the yt.* pragmas).
PRAGMA yt.InferSchema;
PRAGMA yt.IgnoreWeakSchema;
-- ========================================================================= --
-- Job sizing and map-side JOIN settings for the whole query.
-- yt.DataSizePerJob is overridden again near the end of the file, right
-- before the final action is invoked.
PRAGMA yt.MapJoinLimit = '4096M';
PRAGMA yt.MapJoinShardCount = '10';
PRAGMA yt.DataSizePerJob = '1G';
-- ========================================================================== --
-- Passed as the second argument of TopFreq() when collecting the geo hashes
-- of a crypta_id.
$TOPFREQ_BUFF = 250;
-- Passed as the third argument of GeoHash::Encode() for home coordinates.
$GEO_HASH_LVL = 7;
-- An (ip, dt) pair is kept only when fewer than this many crypta_ids were
-- seen on it (and more than one).
$MAX_CRYPTA_ID_ON_IP = 10;
-- A geo hash is kept only when fewer than this many crypta_ids map to it.
$MAX_CRYPTA_ID_ON_GEO = 1000;
-- A crypta_id is kept only when its TopFreq() geo list has fewer than this
-- many entries.
$MAX_GEO_PER_CRYPTA_ID = 15;
-- Minimum number of distinct shared days for a pair that also shares a geo.
$MIN_DAYS_GEO = 5;
-- Minimum number of distinct shared days for a pair that shares only ips.
$MIN_DAYS_ONLY_IP = 15;
-- crypta_ids with this many or more yandexuid rows in the matching table are
-- excluded from the crypta_id <-> yuid work table.
$MONSTER_CCID = 750;
-- ========================================================================== --
-- ========================================================================== --

-- Source of predicted home locations (read by $homework_cid / $homework_yid).
$homework_uid_tbl = '//home/user_identification/homework/v2/prod/homework_unified_id';
-- Identifier matching table; this file reads its cryptaId, id and id_type
-- columns and keeps only rows with id_type == 'yandexuid'.
-- Previously used source, kept for reference:
-- $matching_table = '{{ graph_output_dir }}/v2/matching/vertices_no_multi_profile_by_id_type';
$matching_table = '{{ graph_output_dir }}/v2/export/ActiveIdentifiers';
-- Folder with per-yuid ip activity tables; read as a whole through RANGE().
$hh_storage = '{{ household_dir }}/storage';
-- Prefix for the intermediate tables written by the actions below.
$work_dir = '{{ household_dir }}/workdir';
-- Output table of $publish_ip_tables.
$crypta_id_yuid_ip_table = '{{ household_dir }}/output/crypta_id_ip';

-- Work tables written by $prepare_crypta_id_yuid and read by later actions.
$crypta_id_yuid_tbl = $work_dir || '/crypta_id_yuid';
$all_yuids_tbl = $work_dir || '/all_yuids';

-- Predicted home per crypta_id: homework rows whose unified id is a crypta_id.
$homework_cid = (
    SELECT
        hw.unified_id AS crypta_id,
        hw.predicted_home AS predicted_home
    FROM $homework_uid_tbl AS hw
    WHERE hw.source_unified_id == "crypta_id"
);

-- Predicted home per yandexuid: homework rows whose unified id is a yandexuid.
-- NOTE(review): not referenced anywhere in the visible source — confirm
-- whether it is still needed.
$homework_yid = (
    SELECT
        hw.unified_id AS yandexuid,
        hw.predicted_home AS predicted_home
    FROM $homework_uid_tbl AS hw
    WHERE hw.source_unified_id == "yandexuid"
);

-- Run date, rendered from the `date` template variable.
$dt = '{{ date }}';
-- The run date minus 30 days, formatted as YYYY-MM-DD by the template.
$from = '{{
    datetime.datetime.strftime(
        datetime.datetime.strptime(date, "%Y-%m-%d")
            - datetime.timedelta(days=30),
        "%Y-%m-%d"
    )
}}'; -- 30D

-- All columns of every table found under the household storage folder.
-- NOTE(review): $dt and $from are not referenced anywhere in the visible
-- source and RANGE() is called without bounds, so the 30-day window is not
-- applied here. Confirm whether RANGE($hh_storage, $from, $dt) was intended
-- or whether retention of the storage folder is handled elsewhere.
$exp_range = (
    SELECT storage.*
    FROM RANGE($hh_storage) AS storage
);

-- Publishes per (crypta_id, yuid, ip) activity: the distinct dates on which
-- the ip was seen and the total number of hits. Storage rows whose yuid has
-- no match in the matching table are kept with a NULL crypta_id (LEFT JOIN).
DEFINE ACTION $publish_ip_tables() AS
    -- Raw activity rows from the household storage.
    $storage_hits = (
        SELECT yuid, ip, dt, hits
        FROM $exp_range
    );

    -- yandexuid rows of the matching table, with the id cast to UInt64.
    $yuid_to_crypta = (
        SELECT
            CAST(cryptaId AS String) AS crypta_id,
            CAST(id AS UInt64) AS yuid
        FROM $matching_table
        WHERE id_type == 'yandexuid'
    );

    -- Activity rows enriched with the crypta_id when one is known.
    $hits_with_crypta = (
        SELECT
            mt.crypta_id AS crypta_id,
            (mt.yuid ?? hh.yuid) AS yuid,
            hh.ip AS ip,
            CAST(hh.dt AS Date) AS dt,
            hh.hits AS hits
        FROM $storage_hits AS hh
        LEFT JOIN $yuid_to_crypta AS mt
        USING (yuid)
    );

    INSERT INTO $crypta_id_yuid_ip_table WITH TRUNCATE
    SELECT
        crypta_id,
        yuid,
        ip,
        AGGREGATE_LIST_DISTINCT(dt) AS dates,
        SUM(hits) AS total_hits
    FROM $hits_with_crypta
    GROUP BY crypta_id, yuid, ip
    ORDER BY crypta_id, yuid, ip;
END DEFINE;

-- Builds the base tables used by the later actions:
--   @yuid_geohash        yuid -> geohash of the predicted home
--   $crypta_id_yuid_tbl  crypta_id <-> yuid, without oversized crypta_ids
--   $all_yuids_tbl       every yuid from the storage with its is_tv flag
--   @yuid_ip_dt          yuid -> (ip, dt, hits, is_tv), yuid != 0 only
DEFINE ACTION $prepare_crypta_id_yuid() AS

    -- crypta_id <-> yandexuid rows of the matching table; the same
    -- projection feeds three of the statements below.
    $yandexuid_links = (
        SELECT
            CAST(cryptaId AS String) AS crypta_id,
            id AS yuid
        FROM $matching_table
        WHERE id_type == 'yandexuid'
    );

    -- Predicted home of each crypta_id, fanned out to its yuids. yuid is
    -- NULL when the crypta_id has no yandexuid row (LEFT JOIN).
    $yuid_home = (
        SELECT
            ids.yuid AS yuid,
            Yson::ConvertToDoubleDict(hw.predicted_home) AS home
        FROM $homework_cid AS hw
        LEFT JOIN $yandexuid_links AS ids
        USING (crypta_id)
    );

    -- yuid -- geohash
    INSERT INTO @yuid_geohash WITH TRUNCATE
    SELECT
        yuid,
        GeoHash::Encode(
            home['latitude'], home['longitude'], $GEO_HASH_LVL
        ) AS home_hash
    FROM $yuid_home
    WHERE yuid IS NOT NULL
        AND ListLength(DictKeys(home)) > 0
    ORDER BY yuid;

    -- crypta_ids with fewer than $MONSTER_CCID yandexuid rows.
    $regular_crypta_ids = (
        SELECT crypta_id
        FROM $yandexuid_links
        GROUP BY crypta_id
        HAVING COUNT(1) < $MONSTER_CCID
    );

    INSERT INTO $crypta_id_yuid_tbl WITH TRUNCATE
    SELECT crypta_id, yuid
    FROM $yandexuid_links AS ids
    LEFT SEMI JOIN $regular_crypta_ids AS ok_ids
    USING (crypta_id)
    ORDER BY crypta_id, yuid;

    -- One row per yuid (as a string) with the maximum is_tv seen for it.
    INSERT INTO $all_yuids_tbl WITH TRUNCATE
    SELECT yuid, MAX(is_tv) AS is_tv
    FROM $exp_range
    GROUP BY CAST(yuid AS String) AS yuid
    ORDER BY yuid;

    -- yuid -- ip, dt, hits (is_tv)
    INSERT INTO @yuid_ip_dt WITH TRUNCATE
    SELECT
        CAST(yuid AS String) AS yuid,
        ip,
        dt,
        hits,
        is_tv
    FROM $exp_range
    WHERE yuid != 0
    ORDER BY yuid;

END DEFINE;

-- ========================================================================== --

-- Writes @crypta_id_geo: one (geo, crypta_id) row per crypta_id, where geo is
-- the first entry of the crypta_id's TopFreq() list of home geohashes.
-- crypta_ids with too many distinct geos and geos shared by too many
-- crypta_ids are dropped.
DEFINE ACTION $prepare_crypta_id_geo() AS
    -- Every home geohash reachable from a crypta_id through its yuids.
    $crypta_geo_candidates = (
        SELECT
            ids.crypta_id AS crypta_id,
            homes.home_hash AS geo
        FROM $crypta_id_yuid_tbl AS ids
        INNER JOIN @yuid_geohash AS homes
        USING (yuid)
    );

    -- NOTE(review): per the YQL docs the second TopFreq() argument is the
    -- number of returned values, not the buffer size — confirm that the
    -- constant's name matches its intended role.
    $geo_frequencies = (
        SELECT
            crypta_id,
            TopFreq(geo, $TOPFREQ_BUFF) AS tpfr_geo
        FROM $crypta_geo_candidates
        GROUP BY crypta_id
    );

    -- First TopFreq() entry per crypta_id, only for crypta_ids whose list is
    -- shorter than $MAX_GEO_PER_CRYPTA_ID.
    $crypta_main_geo = (
        SELECT
            crypta_id,
            tpfr_geo[0].Value AS geo,
            tpfr_geo
        FROM $geo_frequencies
        WHERE ListLength(tpfr_geo) < $MAX_GEO_PER_CRYPTA_ID
    );

    -- Geos assigned to fewer than $MAX_CRYPTA_ID_ON_GEO crypta_ids.
    $ok_geo = (
        SELECT geo
        FROM $crypta_main_geo
        GROUP BY geo
        HAVING COUNT(crypta_id) < $MAX_CRYPTA_ID_ON_GEO
    );

    $kept_crypta_geo = (
        SELECT
            main_geo.geo AS geo,
            main_geo.crypta_id AS crypta_id
        FROM $crypta_main_geo AS main_geo
        LEFT SEMI JOIN $ok_geo AS ok_geo
        USING (geo)
    );

    INSERT INTO @crypta_id_geo WITH TRUNCATE
    SELECT geo, crypta_id
    FROM $kept_crypta_geo
    ORDER BY geo;
END DEFINE;

-- ========================================================================== --

-- Writes @crypta_id_ip_dt: hits per (crypta_id, ip, dt), restricted to
-- (ip, dt) pairs seen by more than one and fewer than $MAX_CRYPTA_ID_ON_IP
-- crypta_ids.
DEFINE ACTION $prepare_crypta_id_ip() AS
    -- Activity rows of every yuid, attributed to its crypta_id.
    $crypta_activity = (
        SELECT
            ids.crypta_id AS crypta_id,
            seen.ip AS ip,
            seen.dt AS dt,
            seen.hits AS hits
        FROM $crypta_id_yuid_tbl AS ids
        INNER JOIN @yuid_ip_dt AS seen
        USING (yuid)
    );

    -- One row per (crypta_id, ip, dt) with the summed hits.
    $crypta_ip_day = (
        SELECT
            crypta_id,
            ip,
            dt,
            SUM(hits) AS hits
        FROM $crypta_activity
        GROUP BY crypta_id, ip, dt
    );

    -- $crypta_ip_day has one row per crypta_id, so COUNT(1) per (ip, dt) is
    -- the number of crypta_ids seen on that ip that day.
    $ok_ip_day = (
        SELECT ip, dt
        FROM $crypta_ip_day
        GROUP BY ip, dt
        HAVING
            COUNT(1) > 1  -- an ip used by a single crypta_id links nobody
            AND COUNT(1) < $MAX_CRYPTA_ID_ON_IP  -- drop collective ips
    );

    $kept_crypta_ip_day = (
        SELECT
            per_day.crypta_id AS crypta_id,
            per_day.ip AS ip,
            per_day.dt AS dt,
            per_day.hits AS hits
        FROM $crypta_ip_day AS per_day
        LEFT SEMI JOIN $ok_ip_day AS ok_day
        USING (ip, dt)
    );

    INSERT INTO @crypta_id_ip_dt WITH TRUNCATE
    SELECT crypta_id, ip, dt, hits
    FROM $kept_crypta_ip_day
    ORDER BY ip, dt;
END DEFINE;

-- ========================================================================== --

-- Pairs of distinct crypta_ids that share the same geo in @crypta_id_geo.
-- ip, dt and ssid are emitted as NULL so that the column set matches
-- $join_by_ip_dt and the two can be combined with UNION ALL.
-- The result is intentionally not sorted: the consumer in this file
-- aggregates the union and applies its own ORDER BY, so sorting the
-- self-join product here would be wasted work.
DEFINE SUBQUERY $join_by_geo() AS
    SELECT
        lhs.crypta_id AS id1,
        rhs.crypta_id AS id2,
        NULL AS ip,
        NULL AS dt,
        lhs.geo AS geo,
        NULL AS ssid
    FROM @crypta_id_geo AS lhs
    INNER JOIN @crypta_id_geo AS rhs
    USING (geo)
    -- keep each unordered pair once and drop self-pairs
    WHERE (lhs.crypta_id < rhs.crypta_id);
END DEFINE;

-- Pairs of distinct crypta_ids seen on the same ip on the same day, one row
-- per shared (ip, dt) in @crypta_id_ip_dt.
-- geo and ssid are emitted as NULL so that the column set matches
-- $join_by_geo and the two can be combined with UNION ALL.
-- The result is intentionally not sorted: the consumer in this file
-- aggregates the union and applies its own ORDER BY, so sorting the
-- self-join product here would be wasted work.
DEFINE SUBQUERY $join_by_ip_dt() AS
    SELECT
        lhs.crypta_id AS id1,
        rhs.crypta_id AS id2,
        lhs.ip AS ip,
        lhs.dt AS dt,
        NULL AS geo,
        NULL AS ssid
    FROM @crypta_id_ip_dt AS lhs
    INNER JOIN @crypta_id_ip_dt AS rhs
    USING (ip, dt)
    -- keep each unordered pair once and drop self-pairs
    WHERE (lhs.crypta_id < rhs.crypta_id);
END DEFINE;

-- Writes the crypta_id pairs that pass the linking thresholds:
--   * the pair shares a geo and has at least $MIN_DAYS_GEO distinct shared
--     (ip, day) dates, or
--   * the pair shares no geo and has at least $MIN_DAYS_ONLY_IP such dates.
DEFINE ACTION $crypta_id_with_crypta_id() AS
    $pairs_tbl = $work_dir || '/crypta_id_crypta_id';

    -- Geo-based and ip-based pair rows; geo rows carry a NULL dt and ip rows
    -- carry a NULL geo.
    $pair_evidence = (
        SELECT id1, id2, ip, dt, geo, ssid FROM $join_by_geo()
        UNION ALL
        SELECT id1, id2, ip, dt, geo, ssid FROM $join_by_ip_dt()
    );

    -- Per pair: whether any row carries a geo, and the number of distinct
    -- non-NULL days.
    $pair_stats = (
        SELECT
            id1,
            id2,
            (SOME(geo) IS NOT NULL) AS has_geo,
            COUNT(DISTINCT dt) AS shared_days
        FROM $pair_evidence
        GROUP BY id1, id2
    );

    INSERT INTO $pairs_tbl WITH TRUNCATE
    SELECT id1, id2
    FROM $pair_stats
    WHERE
        -- equal geo and 5+ days ip
        (has_geo AND shared_days >= $MIN_DAYS_GEO)
        OR -- equal ip 15+ days
        (NOT has_geo AND shared_days >= $MIN_DAYS_ONLY_IP)
    ORDER BY id1, id2;
END DEFINE;

-- Assigns a crypta_id to TV yuids that have none in the crypta_id <-> yuid
-- table: for each such TV, picks the crypta_id seen on the same (ip, dt)
-- pairs with the greatest (number of shared pairs, crypta_id hits) tuple.
DEFINE ACTION $homeless_tvs() AS
    $out_tbl = $work_dir || '/homeless_tvs';

    -- TV yuids without any row in the crypta_id <-> yuid table.
    $unmatched_tvs = (
        SELECT all_ids.yuid AS yuid
        FROM $all_yuids_tbl AS all_ids
        LEFT ONLY JOIN $crypta_id_yuid_tbl AS ids
        USING (yuid)
        WHERE all_ids.is_tv
    );

    -- ip activity of those TVs.
    $unmatched_tv_activity = (
        SELECT
            seen.yuid AS yuid,
            seen.ip AS ip,
            seen.dt AS dt,
            seen.hits AS hits
        FROM @yuid_ip_dt AS seen
        LEFT SEMI JOIN $unmatched_tvs AS wanted_tv
        USING (yuid)
    );

    -- Every (tv, crypta_id) co-occurrence on the same ip and day.
    $tv_ccid_same_ip_day = (
        SELECT
            tv_seen.yuid AS tv,
            tv_seen.ip AS ip,
            tv_seen.dt AS dt,
            tv_seen.hits AS tv_hits,
            cc_seen.crypta_id AS ccid,
            cc_seen.hits AS ccid_hits
        FROM $unmatched_tv_activity AS tv_seen
        INNER JOIN @crypta_id_ip_dt AS cc_seen
        USING (ip, dt)
    );

    -- Per (tv, crypta_id): the ranking tuple and the TV hits on shared pairs.
    $tv_ccid_activity = (
        SELECT
            tv,
            ccid,
            AsTuple(COUNT(1), SUM(ccid_hits)) AS day_activity,
            SUM(tv_hits) AS tv_hits
        FROM $tv_ccid_same_ip_day
        GROUP BY tv, ccid
    );

    -- Best crypta_id per TV.
    -- NOTE(review): hits sums tv_hits over all candidate crypta_ids, so a
    -- TV's hits on an (ip, dt) shared with several crypta_ids are counted
    -- once per crypta_id — confirm this is intended.
    $best_ccid_per_tv = (
        SELECT
            tv,
            MAX_BY(ccid, day_activity) AS ccid,
            SUM(tv_hits) AS hits
        FROM $tv_ccid_activity
        GROUP BY tv
    );

    INSERT INTO $out_tbl WITH TRUNCATE
    SELECT
        tv AS yuid,
        ccid,
        hits
    FROM $best_ccid_per_tv
    ORDER BY ccid, yuid;
END DEFINE;
-- ========================================================================== --

-- Materialize the intermediate tables first so that the later JOINs can run
-- as JoinReduce over prepared tables instead of MapReduce.
-- This step writes @yuid_geohash, @yuid_ip_dt, the crypta_id <-> yuid table
-- and the all-yuids table.
DO $prepare_crypta_id_yuid();
-- Barrier: the actions below read the tables written above.
COMMIT;

-- Both actions read the tables from the first step and write
-- @crypta_id_geo / @crypta_id_ip_dt respectively.
DO $prepare_crypta_id_geo();
DO $prepare_crypta_id_ip();
-- Barrier: the remaining actions read @crypta_id_geo / @crypta_id_ip_dt.
COMMIT;

-- $homeless_tvs reads @crypta_id_ip_dt; $publish_ip_tables reads only the
-- storage and the matching table.
DO $homeless_tvs();
DO $publish_ip_tables();

-- Overrides the 1G value set at the top of the file before the pair-building
-- action is invoked.
-- NOTE(review): confirm that this pragma affects only the statements that
-- follow it and not the actions invoked above.
PRAGMA yt.DataSizePerJob = '256M';
DO $crypta_id_with_crypta_id();
