-- Template-supplied directories: inputs first, then outputs.
$ids_dir = '{{ ids_dir }}';
$soup_dir = '{{ soup_dir }}';
$out_ids_dir = '{{ out_ids_dir }}';
$out_soup_dir = '{{ out_soup_dir }}';

-- Raw identifier tables, one per id type under $ids_dir
-- (per the original note: unique ids extracted from soup).
$raw_avito_hashes = $ids_dir || '/avito_hash';
$raw_emails = $ids_dir || '/email';
$raw_logins = $ids_dir || '/login';
$raw_phones = $ids_dir || '/phone';

-- Generating new artificial edges
-- 1. Login to email
-- Maps a login to an email address, or NULL when no email can be derived:
--   * invalid logins                         -> NULL
--   * logins containing 'phne-' or 'uid-'    -> NULL (CRYPTAIS-1582: synthetic phone and social auth)
--   * normalized login is already an email   -> the normalized login (CRYPTR-223 pdd)
--   * normalized login + '@yandex.ru' valid  -> that address
-- NOTE(review): String::Contains also matches 'phne-' / 'uid-' in the middle of a login
-- (not only as a prefix) - confirm this is intended.
$login_to_email = ($login) -> {
    $is_valid = Identifiers::IsValidLogin($login);
    $normalized = Identifiers::NormalizeLogin($login);
    $is_synthetic = String::Contains($normalized, 'phne-') OR String::Contains($normalized, 'uid-');
    $with_default_domain = $normalized || '@yandex.ru';
    RETURN CASE
        WHEN NOT $is_valid THEN NULL
        WHEN $is_synthetic THEN NULL
        WHEN Identifiers::IsValidEmail($normalized) THEN $normalized
        WHEN Identifiers::IsValidEmail($with_default_domain) THEN $with_default_domain
        ELSE NULL
    END;
};

-- (login, email) pairs for every raw login from which an email can be derived.
$emails_from_logins = (
    SELECT
        login,
        email
    FROM (
        SELECT
            id AS login,
            $login_to_email(id) AS email
        FROM $raw_logins
    )
    WHERE email IS NOT NULL
);

-- Deduplicated union of raw emails and emails derived from logins.
$final_emails = (
    SELECT email
    FROM (
        SELECT id AS email FROM $raw_emails
        UNION ALL
        SELECT email FROM $emails_from_logins
    )
    GROUP BY email
);

-- Materialize the result into a temporary table that later steps read from.
INSERT INTO @final_emails
SELECT email
FROM $final_emails;

-- Hashing and denormalization are heavy operations, so lower the amount of
-- data handled per job, per map job and per partition to 50M for the steps below.
PRAGMA yt.DataSizePerJob = '50M';
PRAGMA yt.DataSizePerMapJob = '50M';
PRAGMA yt.DataSizePerPartition = '50M';
COMMIT ;  -- per the original note: the commit plus the tmp table are what make the pragmas above take effect

-- 2. Email to phone
-- Takes the part of the address before the first '@'; when that part is a valid
-- phone it is returned normalized, otherwise the result is NULL.
$email_to_phone_inner = ($email) -> {
    $local_part = String::SplitToList($email, '@')[0];
    RETURN CASE
        WHEN $local_part IS NULL THEN NULL
        WHEN Identifiers::IsValidPhone($local_part) THEN Identifiers::NormalizePhone($local_part)
        ELSE NULL
    END;
};

-- Derives a phone from an email whose local part is a phone number; NULL otherwise.
-- NOTE(review): validation and normalization use the *login* helpers on an email
-- value - presumably deliberate (see CRYPTR-703), confirm.
$email_to_phone = ($email) -> {
    $is_valid = Identifiers::IsValidLogin($email);
    $normalized = Identifiers::NormalizeLogin($email);
    $has_domain = String::Contains($normalized, '@');
    RETURN CASE
        WHEN NOT $is_valid THEN NULL
        -- CRYPTR-703: skip values that have no '@' after normalization
        WHEN NOT $has_domain THEN NULL
        ELSE $email_to_phone_inner($normalized)
    END;
};

-- (email, phone) pairs for every final email from which a phone can be derived.
$phones_from_emails = (
    SELECT
        email,
        phone
    FROM (
        SELECT
            email,
            $email_to_phone(email) AS phone
        FROM @final_emails
    )
    WHERE phone IS NOT NULL
);

-- Deduplicated union of raw phones and phones derived from emails.
$final_phones = (
    SELECT phone
    FROM (
        SELECT id AS phone FROM $raw_phones
        UNION ALL
        SELECT phone FROM $phones_from_emails
    )
    GROUP BY phone
);

-- Materialize the result into a temporary table that the hashing step reads from.
INSERT INTO @final_phones
SELECT phone
FROM $final_phones;

COMMIT ;

-- Lower-case hex SHA-256 of the normalized phone.
-- NOTE(review): judging by the name, NormalizePhone presumably yields the '+'-prefixed form - confirm.
$sha256_phone_with_plus = ($value) -> {
    $normalized = Identifiers::NormalizePhone($value);
    $digest = Digest::Sha256($normalized);
    $hex = String::HexEncode($digest);
    RETURN String::AsciiToLower($hex);
};

-- 3. Hashes
-- Every final email together with its md5 and sha256 hashes.
$email_md5_sha256 = (
    SELECT
        src.email AS email,
        Identifiers::HashMd5Email(src.email) AS email_md5,
        Identifiers::HashSha256Email(src.email) AS email_sha256
    FROM @final_emails AS src
);

-- Every final phone with its md5 hash and two sha256 variants: one row uses
-- Identifiers::HashSha256Phone, the other uses $sha256_phone_with_plus.
$phone_md5_sha256 = (
    SELECT
        src.phone AS phone,
        Identifiers::HashMd5Phone(src.phone) AS phone_md5,
        Identifiers::HashSha256Phone(src.phone) AS phone_sha256
    FROM @final_phones AS src

    UNION ALL

    SELECT
        src.phone AS phone,
        Identifiers::HashMd5Phone(src.phone) AS phone_md5,
        $sha256_phone_with_plus(src.phone) AS phone_sha256
    FROM @final_phones AS src
);

-- 4. Avito
-- Hex md5 of the value with the template-supplied salt appended.
$avito_hash = ($orig_value) -> {
    $salted = $orig_value || '{{avito_salt}}';
    RETURN Digest::Md5Hex($salted);
};

-- Expands an address given as (login part, domain part) into the list of addresses to hash.
-- For the 'yandex.ru' domain: '-' in the login is replaced with '.', and one address
-- is produced per listed Yandex domain. For any other domain: the single original address.
$denorm_email_inner = ($login_part, $domain_part) -> {
    $dotted_login = String::ReplaceAll($login_part, '-', '.');
    $yandex_suffixes = AsList(
        '@yandex.ru',
        '@ya.ru',
        '@yandex.by',
        '@yandex.ua',
        '@yandex.kz',
        '@yandex.com',
        '@yandex.com.tr'
    );
    $yandex_variants = ListMap($yandex_suffixes, ($suffix) -> { RETURN $dotted_login || $suffix; });
    RETURN IF($domain_part == 'yandex.ru',
        $yandex_variants,
        AsList($login_part || '@' || $domain_part)
    );
};

-- Splits the email on '@' and expands it via $denorm_email_inner.
-- Anything that does not split into exactly two parts yields an empty list.
$denorm_email = ($email) -> {
    $pieces = String::SplitToList($email, '@');
    $local = $pieces[0];
    $domain = $pieces[1];
    RETURN IF(ListLength($pieces) == 2,
        $denorm_email_inner($local, $domain),
        ListCreate(String)
    );
};

-- One row per (original email, denormalized variant), carrying the avito hash of the variant.
$all_avitified_emails = (
    SELECT
        source_email AS email,
        $avito_hash(variant) AS email_avito_hash
    FROM (
        SELECT
            email AS source_email,
            $denorm_email(email) AS variants
        FROM @final_emails
    ) FLATTEN LIST BY variants AS variant
);

-- Keeps only the computed hashes that are present among the raw avito hashes.
-- The name (including the "hashs" spelling) is referenced by the edge-writing
-- template later in this file, so it must stay as is.
$avito_hashs_from_emails = (
    SELECT
        candidate.email AS email,
        candidate.email_avito_hash AS avito_hash
    FROM $all_avitified_emails AS candidate
    LEFT SEMI JOIN $raw_avito_hashes AS known
        ON candidate.email_avito_hash == known.id
);

-- Raise the per-job data sizes again for the remaining steps.
-- Per the original note this is not the real default: the default is 1G, which is
-- too heavy here, so a quarter of it (256M) is used instead.
PRAGMA yt.DataSizePerJob = '256M';
PRAGMA yt.DataSizePerMapJob = '256M';
PRAGMA yt.DataSizePerPartition = '256M';

-- Insert ids tables: for emails and phones write the deduplicated plain value,
-- md5 and sha256 into separate tables under $out_ids_dir, tagged with the id type.
{% for identifier in ('email', 'phone') %}
    {%- for target in ('', '_md5', '_sha256') %}

$ids_{{ identifier + target }} = $out_ids_dir || '/{{ identifier + target }}';
INSERT INTO $ids_{{ identifier + target }} WITH TRUNCATE
SELECT
    id,
    EvaluateExpr(IdType::{{ (identifier + target)|upper }}()) AS id_type
FROM (
    SELECT DISTINCT
        {{ identifier + target }} AS id
    FROM ${{ identifier }}_md5_sha256
);

    {%- endfor -%}
{% endfor %}

-- Ids table with the avito hashes that were matched to emails.
-- Several source emails can denormalize to the same address (for example
-- 'a@ya.ru' and 'a@yandex.ru', or 'a-b@yandex.ru' and 'a.b@yandex.ru') and
-- therefore share one avito hash, so the hashes are deduplicated here in the
-- same way as in the other ids tables.
$ids_avito_hash = $out_ids_dir || '/avito_hash';
INSERT INTO $ids_avito_hash WITH TRUNCATE
SELECT
    id,
    EvaluateExpr(IdType::AVITO_HASH()) AS id_type
FROM (
    SELECT id
    FROM $avito_hashs_from_emails
    GROUP BY avito_hash AS id
);

-- insert soup tables
-- Builds the output soup table path:
-- $out_soup_dir/<id1_type>_<id2_type>_<source_type>_<SOUP_PREPROCESSING log source>.
$get_soup_table = ($id1_type, $id2_type, $source_type) -> {
    $log_source = EvaluateExpr(LogSource::SOUP_PREPROCESSING());
    RETURN $out_soup_dir || '/' || $id1_type || '_' || $id2_type || '_' || $source_type || '_' || $log_source;
};

{% for identifier in ('email', 'phone') %}
    {%- for target in ('md5', 'sha256') %}

-- Edges from the plain identifier to its hash; pairs are deduplicated.
${{ identifier }}_to_{{ target }} = $out_soup_dir || '/{{ out_tables[identifier, target] }}';
INSERT INTO ${{ identifier }}_to_{{ target }} WITH TRUNCATE
SELECT
    id1,
    EvaluateExpr(IdType::{{ identifier|upper }}()) AS id1Type,
    id2,
    EvaluateExpr(IdType::{{ '{}_{}'.format(identifier, target)|upper }}()) AS id2Type,
    EvaluateExpr(SourceType::{{ target|upper }}_HASH()) AS sourceType,
    EvaluateExpr(LogSource::SOUP_PREPROCESSING()) AS logSource,
    ListCreate(String) AS dates
FROM (
    SELECT DISTINCT
        {{ identifier }} AS id1,
        {{ '{}_{}'.format(identifier, target) }} AS id2
    FROM ${{ identifier }}_md5_sha256
);

    {%- endfor %}

-- Edges between the md5 and sha256 hashes of the same identifier; pairs are deduplicated.
${{ identifier }}_to_md5_sha256 = $out_soup_dir || '/{{ out_tables[identifier, 'md5', 'sha256'] }}';
INSERT INTO ${{ identifier }}_to_md5_sha256 WITH TRUNCATE
SELECT
    id1,
    EvaluateExpr(IdType::{{ identifier|upper }}_MD5()) AS id1Type,
    id2,
    EvaluateExpr(IdType::{{ identifier|upper }}_SHA256()) AS id2Type,
    EvaluateExpr(SourceType::HASH_TO_HASH()) AS sourceType,
    EvaluateExpr(LogSource::SOUP_PREPROCESSING()) AS logSource,
    ListCreate(String) AS dates
FROM (
    SELECT DISTINCT
        {{ identifier }}_md5 AS id1,
        {{ identifier }}_sha256 AS id2
    FROM ${{ identifier }}_md5_sha256
);

{%- endfor %}

-- Write the derived edge tables: login -> email, email -> phone, email -> avito_hash.
-- The source expression is assembled by the template as $<id2_type>s_from_<id1_type>s,
-- i.e. $emails_from_logins, $phones_from_emails and $avito_hashs_from_emails.
-- sourceType is AVITO for avito_hash edges and <ID1>_TO_<ID2> for the others.
-- All edges get the SOUP_PREPROCESSING log source and an empty dates list.
{% for (id1_type, id2_type) in (
    ('login', 'email', ), ('email', 'phone', ), ('email', 'avito_hash', ), ) %}
${{ id1_type }}_{{ id2_type }} = $out_soup_dir || '/{{ out_tables[id1_type, id2_type] }}';
INSERT INTO ${{ id1_type }}_{{ id2_type }} WITH TRUNCATE
SELECT
    {{ id1_type }} AS id1,
    EvaluateExpr(IdType::{{ id1_type|upper }}()) AS id1Type,
    {{ id2_type }} AS id2,
    EvaluateExpr(IdType::{{ id2_type|upper }}()) AS id2Type,
    EvaluateExpr(SourceType::
        {%- if id2_type == 'avito_hash' -%}
            AVITO
        {%- else -%}
            {{ id1_type|upper }}_TO_{{ id2_type|upper }}
        {%- endif -%}
    ()) AS sourceType,
    EvaluateExpr(LogSource::SOUP_PREPROCESSING()) AS logSource,
    ListCreate(String) AS dates
FROM ${{ id2_type }}s_from_{{ id1_type }}s;

{% endfor %}


-- 5. Clean yandexuid_email_sender_sender to yandexuid_email_sender_preproc
-- The input table name is the yandexuid / email id types, the EMAIL_SENDER source type
-- and the SENDER_LOG log source joined with '_', located under $soup_dir.
$sender_table_name_parts = AsList(
    EvaluateExpr(IdType::YANDEXUID()),
    EvaluateExpr(IdType::EMAIL()),
    EvaluateExpr(SourceType::EMAIL_SENDER()),
    EvaluateExpr(LogSource::SENDER_LOG())
);
$input_sender_table = $soup_dir || '/' || String::JoinFromList($sender_table_name_parts, '_');

-- Copy only the edges whose id2 occurs in at most 50 rows of the input table;
-- logSource is rewritten to SOUP_PREPROCESSING, everything else is kept.
$output_sender = $out_soup_dir || '/{{ out_tables['yandexuid', 'email'] }}';
INSERT INTO $output_sender WITH TRUNCATE
SELECT
    edge.id1 AS id1,
    edge.id1Type AS id1Type,
    edge.id2 AS id2,
    edge.id2Type AS id2Type,
    edge.sourceType AS sourceType,
    EvaluateExpr(LogSource::SOUP_PREPROCESSING()) AS logSource,
    edge.dates AS dates
FROM $input_sender_table AS edge
LEFT SEMI JOIN (
    SELECT
        id2
    FROM $input_sender_table
    GROUP BY id2
    HAVING COUNT(*) <= 50
) AS rare
    ON edge.id2 == rare.id2
;

-- uuid -> icookie edges: each distinct normalized uuid of the selected apps
-- is paired with the icookie computed from it by Identifiers::HashIcookie.
$output_icookie_hash = $out_soup_dir || '/{{ out_tables['uuid', 'icookie'] }}';
INSERT INTO $output_icookie_hash WITH TRUNCATE
SELECT
    src.`uuid` AS id1,
    EvaluateExpr(IdType::UUID()) AS id1Type,
    Identifiers::HashIcookie(src.`uuid`) AS id2,
    EvaluateExpr(IdType::ICOOKIE()) AS id2Type,
    EvaluateExpr(SourceType::ICOOKIE_HASH()) AS sourceType,
    EvaluateExpr(LogSource::SOUP_PREPROCESSING()) AS logSource,
    ListCreate(String) AS dates
FROM (
    SELECT DISTINCT
        Identifiers::NormalizeUuid(id) AS `uuid`
    FROM `{{ uuid_eternal_tbl }}`
    WHERE Identifiers::IsValidUuid(id)
        AND (
            -- android apps
            app_id LIKE 'ru.yandex.searchplugin%'        -- pp android
            OR app_id LIKE 'ru.yandex.browser%'          -- ybro android
            OR app_id LIKE 'com.yandex.browser%'         -- ybro android
            -- ios apps
            OR app_id LIKE 'ru.yandex.mobile'            -- pp ios / ybro ios; no wildcard, so this is an exact match
            OR app_id LIKE 'ru.yandex.mobile.search%'    -- pp ios / ybro ios
        )
) AS src;
