-- WebVisor 2.0 parsing process

-- Skip the YQL::Udf warning raised when constructing the protobuf parser
{% if not is_embedded %}
PRAGMA Library = "wv_lib.sql";
PRAGMA Library = "proto_lib.sql";
{% endif %}

IMPORT {% if is_embedded %}.tmp.{% endif %}wv_lib SYMBOLS
    $wv_parse_email, $wv_parse_phone, $wv_parse_date, $id_symbols_only, $is_bad_phone,
    $is_bad_date, $get_url_login_tokens, $get_field_login_tokens, $is_login;
IMPORT {% if is_embedded %}.tmp.{% endif %}proto_lib SYMBOLS $page_parse, $event_parse;
-- ========================================================================== --
-- Input and output table paths; every value is substituted by the template renderer.
-- WebVisor log table for the processed date (read by $webvisor_log).
$webvisor_log_src = '{{ webvisor_log_dir }}/{{ date }}';
-- Target table of the parsed email / phone records.
$webvisor_processed_out = '{{ webvisor_processed }}';
-- Targets passed to $make_yuid_raw for the email and phone identifier types.
$yuid_raw_email_out = '{{ yuid_raw_email }}';
$yuid_raw_phone_out = '{{ yuid_raw_phone }}';
-- Target table of the soup edges insert at the end of the script.
$soup_table = '{{ soup_table }}';

-- Used as a boolean operand in the $filtered_edges filter: when it is true, rows
-- whose field has an empty id and name are accepted even if is_login is not set.
$WEBVISOR_TOLERANCE = {{ webvisor_tolerance }};
-- id_type label assigned to values recognised by $wv_parse_date.
$DATE_TYPE = "date";

-- ========================================================================= --
-- Placeholder substituted for yuid / duid when the cast to String yields NULL.
$null_uid = "0";

-- Raw WebVisor rows for the processed date: ids are turned into strings
-- (falling back to $null_uid) and the protobuf payload is decoded with the
-- parser that matches the record type.
$webvisor_log = (
    SELECT
        `timestamp` AS ts,
        CAST(counterId AS String) AS counterid,
        hit,
        type,
        -- a userId that merely repeats the duid is replaced by 0, so the same
        -- identifier is not reported both as yuid and as duid
        COALESCE(
            CAST(IF(userId == duid, 0, userId) AS String),
            $null_uid
        ) AS yuid,
        COALESCE(
            CAST(duid AS String),
            $null_uid
        ) AS duid,
        -- 'event' rows go through the event parser, all other rows through the page parser
        IF(
            type == 'event',
            $event_parse(data),
            $page_parse(data)
        ) AS proto,
        url
    FROM $webvisor_log_src
    WITH COLUMNS Struct<
        url: String?
    >
);

-- ========================================================================= --

-- Splits the decoded payload by record type. Event rows get `events`: the
-- payload wrapped into a one-element list. Page rows with a decoded payload
-- get `pages`: a one-element list holding the page content. In every other
-- case the respective column is NULL.
$parse_proto_wv2_log = (
    SELECT
        yuid,
        duid,
        counterid,
        hit,
        ts,
        url,
        IF(
            type == 'event',
            AsList(proto)
        ) AS events,
        IF(
            type == 'page' AND proto IS NOT NULL,
            AsList(
                AsStruct(Unwrap(proto.content) AS content)
            )
        ) AS pages
    FROM $webvisor_log
);

-- ========================================================================== --

-- For every (yuid, duid, counterid, hit, target) keeps the change-event value
-- that has the greatest timestamp among the non-NULL values; groups that
-- never carried a non-NULL value are dropped.
$select_events = (
    SELECT
        yuid,
        duid,
        counterid,
        hit,
        target,
        MAX_BY(raw_value, IF(raw_value IS NOT NULL, ts)) AS event_value
    FROM (
        SELECT
            yuid,
            duid,
            counterid,
            hit,
            ts,
            -- fields taken from the parsed event message
            event.target AS target,
            event.changeEvent.value AS raw_value
        FROM $parse_proto_wv2_log
        FLATTEN LIST BY events AS event
    )
    GROUP BY
        yuid,
        duid,
        counterid,
        hit,
        target
    HAVING
        MAX_BY(raw_value, IF(raw_value IS NOT NULL, ts)) IS NOT NULL
);

-- Runs the email, phone and date parsers over every selected event value and
-- keeps only the rows where exactly one of the three parsers produced a result.
$parse_ids_from_events = (
    SELECT
        yuid,
        duid,
        counterid,
        hit,
        target,
        value
    FROM (
        SELECT
            -- row key
            yuid,
            duid,
            counterid,
            hit,
            target,
            AsStruct(
                $wv_parse_email(event_value) AS email,
                $wv_parse_phone(event_value) AS phone,
                $wv_parse_date(event_value) AS dt
            ) AS value
        FROM $select_events
    )
    WHERE
        -- count the parser outputs that are not NULL; exactly one is required
        ListLength(
            ListFilter(
                AsList(value.dt, value.email, value.phone),
                ($parsed) -> {
                    RETURN $parsed IS NOT NULL;
                }
            )
        ) == 1
);

-- Describes the HTML element behind every target on the hits that produced a
-- parsed identifier: tag name, id / name attributes passed through
-- $id_symbols_only, and the type attribute.
$select_pages = (
    SELECT
        yuid,
        duid,
        counterid,
        hit,
        target,
        AsStruct(
            picked_element.name AS tag,
            $id_symbols_only(picked_element.attr['id']) AS id,
            $id_symbols_only(picked_element.attr['name']) AS name,
            IF(
                picked_element.attr['type'] IS NOT NULL,
                Unwrap(picked_element.attr['type'])
            ) AS type
        ) AS element
    FROM (
        -- one arbitrary element per target when the same target occurs several times
        SELECT
            yuid,
            duid,
            counterid,
            hit,
            target,
            SOME(html_element) AS picked_element
        FROM (
            SELECT
                yuid,
                duid,
                counterid,
                hit,
                html_node.id AS target,
                AsStruct(
                    html_node.id AS id,
                    html_node.name AS name,
                    -- attribute list turned into a key -> value dictionary
                    ToDict(
                        ListMap(
                            html_node.attributes,
                            ($attribute) -> {
                                RETURN AsTuple($attribute.key, $attribute.value);
                            }
                        )
                    ) AS attr
                ) AS html_element
            FROM (
                SELECT
                    yuid,
                    duid,
                    counterid,
                    hit,
                    page.content AS page_nodes
                FROM (
                    -- only hits that also appear in $parse_ids_from_events
                    SELECT
                        webvisor.yuid AS yuid,
                        webvisor.duid AS duid,
                        webvisor.counterid AS counterid,
                        webvisor.hit AS hit,
                        webvisor.pages AS pages
                    FROM $parse_proto_wv2_log AS webvisor
                    LEFT SEMI JOIN $parse_ids_from_events AS events
                    USING (yuid, duid, counterid, hit)
                )
                FLATTEN LIST BY pages AS page
            )
            FLATTEN LIST BY page_nodes AS html_node
        )
        GROUP BY
            yuid,
            duid,
            counterid,
            hit,
            target
    )
);

-- Per hit: the earliest timestamp and the url of the latest row that has a url.
$select_url_ts = (
    SELECT
        yuid,
        duid,
        counterid,
        hit,
        MIN(ts) AS ts,
        MAX_BY(
            url,
            IF(url IS NOT NULL, ts)
        ) AS url
    FROM $parse_proto_wv2_log
    GROUP BY
        yuid,
        duid,
        counterid,
        hit
);

-- ========================================================================== --

-- Attaches the HTML element description (when one was found) to every parsed
-- identifier and collapses duplicates of the same (hit, ids, id_type, id_value).
$parsed_records = (
    SELECT
        counterid,
        hit,
        yuid,
        duid,
        id_type,
        id_value,
        SOME(tag) AS tag
    FROM (
        SELECT
            -- row key
            parsed.counterid AS counterid,
            parsed.hit AS hit,
            parsed.yuid AS yuid,
            parsed.duid AS duid,
            page_element.element AS tag,
            -- $parse_ids_from_events keeps rows with exactly one parsed field,
            -- so the first non-NULL field is the only one
            COALESCE(value.dt, value.email, value.phone) AS id_value,
            CASE
                WHEN value.email IS NOT NULL THEN IdType::EMAIL()
                WHEN value.phone IS NOT NULL THEN IdType::PHONE()
                WHEN value.dt IS NOT NULL THEN $DATE_TYPE
                ELSE NULL
            END AS id_type
        FROM $parse_ids_from_events AS parsed
        LEFT JOIN $select_pages AS page_element
        USING (counterid, hit, yuid, duid, target)
    )
    WHERE id_value IS NOT NULL
    GROUP BY
        counterid,
        hit,
        yuid,
        duid,
        id_type,
        id_value
);

-- One row per (hit, ids, id_type); a group is kept only when it holds a
-- single distinct value for its identifier type.
$combined_records = (
    SELECT
        counterid,
        hit,
        yuid,
        duid,
        id_type,
        SOME(id_value) AS id_value,
        SOME(tag) AS tag
    FROM $parsed_records
    GROUP BY
        counterid,
        hit,
        yuid,
        duid,
        id_type
    HAVING
        -- not too many values: at most one distinct value per identifier type
        -- (id_type is a grouping key, so only the matching counter can be non-zero)
        COUNT(DISTINCT IF(id_type == IdType::EMAIL(), id_value)) <= 1
        AND COUNT(DISTINCT IF(id_type == IdType::PHONE(), id_value)) <= 1
        AND COUNT(DISTINCT IF(id_type == $DATE_TYPE, id_value)) <= 1
        -- and the group has a value at all
        AND SOME(id_value) IS NOT NULL
);

-- Combined identifier records enriched with the timestamp and url of their hit;
-- records whose hit has no row in $select_url_ts are dropped by the inner join.
$finnaly_parsed_vw_log = (
    SELECT
        rec.yuid AS yuid,
        rec.duid AS duid,
        rec.id_type AS id_type,
        rec.id_value AS id_value,
        rec.tag AS tag,
        hit_info.ts AS ts,
        hit_info.url AS url
    FROM $combined_records AS rec
    INNER JOIN $select_url_ts AS hit_info
    USING (counterid, hit, yuid, duid)
);

-- ========================================================================== --

-- Final email / phone records: identifier, the field it was typed into,
-- the hit url, and the flags consumed by $filtered_edges
-- (bad_value, empty, is_login).
-- No ORDER BY here: YQL ignores ORDER BY without LIMIT inside a named
-- subquery, so the sort is applied on the top-level INSERT below.
$webvisor_processed =
SELECT
    yuid,
    duid,
    -- presumably milliseconds converted to seconds -- TODO confirm the unit of ts
    CAST(ts / 1000 AS String) AS ts,
    url,
    id_type,
    id_value,
    "webvisor" AS source_type,
    tag.id AS field_id,
    tag.tag AS field_tag,
    tag.name AS field_name,
    "{{ date }}" AS id_date,
    -- the $DATE_TYPE branch is kept for completeness; the WHERE clause below
    -- only lets EMAIL and PHONE rows through
    CASE
        WHEN id_value IS NULL THEN false
        WHEN id_type == IdType::EMAIL() THEN EndsWith(id_value, "@yandex-team.ru")
        WHEN id_type == IdType::PHONE() THEN $is_bad_phone(id_value)
        WHEN id_type == $DATE_TYPE THEN $is_bad_date(id_value)
        ELSE false
    END AS bad_value,
    Url::Parse(url).Host AS domain,
    -- true when the field carries neither an id nor a name
    (tag.id == "" AND tag.name == "") AS empty,
    $get_url_login_tokens(url) AS url_login_tokens,
    $get_field_login_tokens(tag.id, tag.name) AS field_login_tokens,
    $is_login(url, tag.id, tag.name) AS is_login,
FROM $finnaly_parsed_vw_log
WHERE id_type IN {IdType::EMAIL(), IdType::PHONE()};


-- The sort is requested on the top-level statement so that the written
-- table is actually ordered by yuid.
INSERT INTO $webvisor_processed_out WITH TRUNCATE
SELECT * FROM $webvisor_processed
ORDER BY yuid;

-- ========================================================================== --

-- Edges (uid -> email / phone) that pass every quality gate: the value is not
-- flagged as bad, the field looks like a login field (or tolerance mode
-- accepts an unnamed field), and both ends are valid identifiers.
-- NOTE(review): $webvisor_log rewrites yuid to "0" when userId equals duid, so
-- the yuid == duid branch looks reachable only when both ids are "0" --
-- confirm that DUID edges are really expected to come out of this query.
$filtered_edges =
SELECT
    IF(
        yuid == duid,
        Identifiers::NormalizeDuid(duid),
        Identifiers::NormalizeYandexuid(yuid)
    ) AS uid,
    IF(
        yuid == duid,
        IdType::DUID(),
        IdType::YANDEXUID()
    ) AS uid_type,
    id_type,
    IF(
        id_type == IdType::EMAIL(),
        Identifiers::NormalizeEmail(id_value),
        Identifiers::NormalizePhone(id_value)
    ) AS id_value,
    id_date,
    source_type
FROM $webvisor_processed
WHERE
    NOT bad_value
    -- login field, or (in tolerance mode) a field without id and name
    AND (is_login OR ($WEBVISOR_TOLERANCE AND empty))
    AND IF(
        yuid == duid,
        Identifiers::IsValidDuid(duid),
        Identifiers::IsValidYandexuid(yuid)
    )
    AND id_type IN {IdType::EMAIL(), IdType::PHONE()}
    -- the value must be valid for its own identifier type
    AND (
        (id_type == IdType::EMAIL() AND Identifiers::IsValidEmail(id_value))
        OR (id_type == IdType::PHONE() AND Identifiers::IsValidPhone(id_value))
    );


-- Upper bound on the number of distinct values kept per (uid, uid_type, id_type).
$limit_of_ids = 100;

-- Caps the distinct identifier values per uid and identifier type, then
-- unrolls the capped list back into one row per value.
-- id_count is COUNT(*) over the whole group, so every value of the same
-- (uid, uid_type, id_type) carries the same group-wide count.
$limited_ids =
SELECT
    uid,
    uid_type,
    id_type,
    id_value,
    source_type,
    id_date,
    id_count
FROM (
    SELECT
        uid,
        uid_type,
        id_type,
        AGGREGATE_LIST_DISTINCT(id_value, $limit_of_ids) AS ids,
        SOME(id_date) AS id_date,
        SOME(source_type) AS source_type,
        COUNT(*) AS id_count
    FROM $filtered_edges
    GROUP BY
        uid,
        uid_type,
        id_type
)
FLATTEN LIST BY ids AS id_value;

-- ========================================================================== --
-- make yuid raw tables

-- Rewrites $out_table with the rows of $limited_ids that belong to the given
-- uid type and identifier type, exposing uid as yuid and sorting by it.
DEFINE ACTION $make_yuid_raw($out_table, $uid_type, $id_type) AS
    INSERT INTO $out_table WITH TRUNCATE
    SELECT
        uid AS yuid,
        id_type,
        id_value,
        id_count,
        id_date,
        source_type
    FROM $limited_ids
    WHERE
        id_type == $id_type
        AND uid_type == $uid_type
    ORDER BY yuid;
END DEFINE;

-- Only yandexuid rows are exported: one table for emails, one for phones.
DO $make_yuid_raw($yuid_raw_email_out, IdType::YANDEXUID(), IdType::EMAIL());
DO $make_yuid_raw($yuid_raw_phone_out, IdType::YANDEXUID(), IdType::PHONE());

-- ========================================================================== --
-- make soup edges

-- Rewrites the soup table with one edge per row of $limited_ids:
-- id1 is the uid, id2 is the email or phone.
INSERT INTO $soup_table WITH TRUNCATE
SELECT
    LogSource::WEBVISOR_LOG() AS logSource,
    SourceType::WEBVISOR() AS sourceType,
    uid AS id1,
    uid_type AS id1Type,
    id_value AS id2,
    id_type AS id2Type,
    -- placeholder activity columns (constant 1 and NULL); per the original
    -- note they exist so that the Python code can append into soup
    1 AS dayHits,
    NULL AS dayActivity
FROM $limited_ids;
