use arnold;
PRAGMA AnsiInForEmptyOrNullableItemsCollections;

-- Run parameters. The @[...] placeholders are substituted by the launching
-- template before the query is executed.
-- $current_ts is subtracted from `ctime` and compared with 86400 further down,
-- so it is presumably a unix timestamp in seconds -- TODO confirm with the launcher.
$current_ts = @[current_ts];
-- Report date written to the `fielddate` column of the final statistics rows.
$fielddate = "@[date]";

-- Full video direct index (read several times below).
$direct_index = "//home/videoindex/full/docbase/prevdata/full_index/direct_index";
-- Title-match table with Url1/Url2 pairs, used to find YouTube duplicates of VH urls.
$dupl_table = "//home/videoindex/bennydictor/vh_index_titles/vh.index.title_match.match";
-- Predicate: true when the string contains at least one Cyrillic letter.
$rus = Re2::Grep("[а-яА-ЯёЁ]");

-- Removes $prefix from the beginning of $s when $s starts with it;
-- otherwise returns $s unchanged (a NULL $s stays NULL).
--
-- The prefix is matched literally with StartsWith. The previous implementation
-- built a LIKE pattern from the prefix, so any "_" or "%" inside the prefix
-- acted as a wildcard and could strip characters that were not the prefix.
$stripPrefix = ($s, $prefix) -> (IF(
    StartsWith($s, $prefix),
    Substring($s, Unwrap(Length($prefix))),
    $s
));

-- Normalised host of $url: the host reported by Url::GetHost with a leading
-- "m." removed first and then a leading "www." removed.
$getHost = ($url) -> {
    $raw_host = Url::GetHost($url);
    $without_mobile = $stripPrefix($raw_host, "m.");
    RETURN $stripPrefix($without_mobile, "www.");
};

-- Grouping urls of documents flagged IsPlatinum whose normalised host is youtube.com.
-- "https://" is prepended before host extraction; presumably GroupingUrl is stored
-- without a scheme in this table -- TODO confirm.
$platinum_yt = (
    SELECT
        GroupingUrl
    FROM `//home/videoindex/export/extra_docflags`
    WHERE
        IsPlatinum
        AND $getHost("https://" || GroupingUrl) == "youtube.com"
);

-- Platinum YouTube documents found in the "ultra" dynamic direct index whose
-- title contains Cyrillic letters.
-- last_24_hr is true when ctime is at most 86400 units older than $current_ts.
$platinum_ultra_map = (
    SELECT
        GroupingUrl,
        title,
        ($current_ts - ctime <= 86400) AS last_24_hr,
        mediainfo.AuthorId AS author_id
    FROM `//home/videoindex/ultra/docbase/dynamic/direct_index` AS di
    LEFT SEMI JOIN $platinum_yt AS p
        USING (GroupingUrl)
    WHERE $rus(di.title)
);

-- Same shape as $platinum_ultra_map, but read from the full direct index and
-- restricted to grouping urls that are NOT already present in $platinum_ultra_map
-- (LEFT ONLY JOIN), so the later concatenation does not repeat a url from both sources.
$platinum_index_map = (
    SELECT
        GroupingUrl,
        title,
        ($current_ts - ctime <= 86400) AS last_24_hr,
        mediainfo.AuthorId AS author_id
    FROM $direct_index AS di
    LEFT SEMI JOIN $platinum_yt AS p
        ON (di.GroupingUrl == p.GroupingUrl)
    LEFT ONLY JOIN $platinum_ultra_map AS u
        ON (di.GroupingUrl == u.GroupingUrl)
    WHERE $rus(di.title)
);

-- All platinum Cyrillic-titled YouTube documents: ultra index rows followed by
-- the full-index rows that were not found in the ultra index.
$platinum_map_concat = (
    SELECT * FROM $platinum_ultra_map
    UNION ALL
    SELECT * FROM $platinum_index_map
);

-- For every Url1 of the title-match table (exposed as vh_url) keeps one
-- arbitrary matching Url2 whose normalised host is youtube.com.
$dups_map = (
    SELECT
        vh_url,
        SOME(Url2) AS dupl_url
    FROM $dupl_table
    WHERE $getHost(Url2) == "youtube.com"
    GROUP BY
        Url1 AS vh_url
);

-- Grouping urls from the content-duplicates table whose urlHash differs from
-- urlBaseHash (presumably documents that are not the base of their duplicate
-- group -- TODO confirm the meaning of the two hashes).
$content_dups_map = (
    SELECT
        GroupingUrl
    FROM `//home/videoindex/duplicates/content_dups/prevdata/content.dups`
    WHERE urlHash != urlBaseHash
);

-- Extracts the channel id from a list of tag structs.
--
-- $tags: list of structs with a `Text` member.
-- Returns the part of the first tag text that follows the literal prefix
-- "voditem_channel_id_", or NULL when no tag text starts with that prefix.
--
-- The prefix is matched literally with StartsWith. The previous LIKE pattern
-- treated every "_" of the prefix as a single-character wildcard, so unrelated
-- tags of the same shape could be picked up as channel ids.
$getChannelId = ($tags) -> {
    $prefix = "voditem_channel_id_";
    $texts = ListMap($tags, ($tag) -> ($tag.Text));
    $channel_tags = ListFilter($texts, ($text) -> (StartsWith($text, $prefix)));
    RETURN IF(
        ListLength($channel_tags) > 0,
        -- Cut exactly the prefix length, so the offset cannot drift from the prefix text.
        Substring(Unwrap($channel_tags[0]), Length($prefix)),
        NULL
    );
};

-- Picks a channel UUID out of a Yson list.
--
-- $chain: Yson node convertible to a list of dicts.
-- Keeps the elements whose "ContentTypeID" equals 2 and returns the "UUID"
-- string of the LAST such element, or NULL when there is none.
-- NOTE(review): ContentTypeID == 2 presumably marks a channel entry -- confirm.
$getChannelIdFromChain = ($chain) -> {
    $nodes = Yson::ConvertToList($chain);
    $channel_nodes = ListFilter(
        $nodes,
        ($node) -> (Yson::LookupInt64($node, "ContentTypeID") == 2)
    );
    -- Reversed so that index 0 addresses the last matching element.
    $last_first = ListReverse($channel_nodes);
    RETURN IF(
        ListLength($channel_nodes) > 0,
        Yson::LookupString(Unwrap($last_first[0]), "UUID"),
        NULL
    );
};

-- Author id per grouping url for full-index documents whose normalised host is youtube.com.
-- NOTE(review): the host is taken from GroupingUrl as is, while other reads of the
-- same table prepend "https://" first -- confirm that both forms give the same host.
$difull_authors = (
    SELECT
        GroupingUrl,
        mediainfo.AuthorId AS author_id
    FROM $direct_index
    WHERE $getHost(GroupingUrl) == "youtube.com"
);

-- VH dynamic direct-index documents enriched with:
--   * a title-match YouTube duplicate (at most one row per document, from $dups_map),
--   * a content-duplicate flag (from $content_dups_map),
--   * the author id of the matched YouTube document (from $difull_authors).
-- Every join is LEFT JOIN ANY, so each index row yields exactly one output row.
$di_map = (
    SELECT
        di.title AS title,
        di.GroupingUrl AS GroupingUrl,
        -- Last "/"-separated segment of the grouping url.
        ListReverse(String::SplitToList(di.GroupingUrl, "/"))[0] AS JoinKey,
        omniData.VhUuid AS `UUID`,
        ($current_ts - ctime <= 86400) AS last_24_hr,
        AsTuple(omniData.VhUuid, dups.dupl_url) AS dups_pair,
        (dups.vh_url IS NOT NULL) AS has_yt_dup,
        difull.author_id AS yt_author_id,
        $getChannelId(mediainfo.OvsDetailedTags) AS channel_id_di,
        (cdups.GroupingUrl IS NOT NULL) AS has_content_dup,
        -- A missing author host counts as "not sourced from YouTube".
        ((mediainfo.Author.Host == "www.youtube.com") ?? false) AS has_yt_source
        -- vhdups.vh_url is not null as has_vh_dup
    FROM `//home/videoindex/vhs/docbase/dynamic/direct_index` AS di
    LEFT JOIN ANY $dups_map AS dups
        ON (di.GroupingUrl == dups.vh_url)
    LEFT JOIN ANY $content_dups_map AS cdups
        ON (di.GroupingUrl == cdups.GroupingUrl)
    LEFT JOIN ANY $difull_authors AS difull
        ON (difull.GroupingUrl == dups.dupl_url)
);

-- Per-item metadata from the iron-branch table.
-- channel_id is ugc_channel_id cast to String when `UUID` starts with "v",
-- otherwise the channel UUID extracted from `chain`.
$ib = (
    SELECT
        JoinKey,
        computed_channel,
        duration,
        IF(
            `UUID` LIKE "v%",
            CAST(ugc_channel_id AS String),
            $getChannelIdFromChain(chain)
        ) AS channel_id,
        ugc_owner_id
    FROM `//home/videolog/strm_meta/iron_branch/concat`
);

-- $di_map rows extended with iron-branch metadata and a VH category:
--   "UGC" when `UUID` starts with "v", "CMS" for any other non-NULL `UUID`, NULL otherwise.
-- NOTE(review): this is a plain LEFT JOIN (not ANY) -- if $ib holds several rows
-- per JoinKey, documents are repeated here; confirm JoinKey is unique in $ib.
$add_data = (
    SELECT
        di.*,
        computed_channel,
        duration,
        channel_id,
        CASE
            WHEN `UUID` IS NOT NULL AND `UUID` LIKE "v%" THEN "UGC"
            WHEN `UUID` IS NOT NULL THEN "CMS"
            ELSE NULL
        END AS vh_category,
        ugc_owner_id
    FROM $di_map AS di
    LEFT JOIN $ib AS ib
        USING (JoinKey)
);

-- (title, duration) of every document that received a VH category.
$vh_dups = (
    SELECT
        title,
        duration
    FROM $add_data
    WHERE vh_category IS NOT NULL
);

-- VH (title, duration) pairs together with full-index YouTube documents whose
-- title does NOT occur among the platinum documents (LEFT ONLY JOIN by title).
$for_yt_text_dups_non_pt = (
    SELECT * FROM $vh_dups
    UNION ALL
    (
        SELECT
            title,
            mediainfo.Duration AS duration
        FROM $direct_index AS di
        LEFT ONLY JOIN $platinum_map_concat AS pt
            USING (title)
        WHERE $getHost("https://" || GroupingUrl) == "youtube.com"
    )
);

-- VH (title, duration) pairs together with full-index YouTube documents whose
-- title DOES occur among the platinum documents (LEFT SEMI JOIN by title).
$for_yt_text_dups_platinum = (
    SELECT * FROM $vh_dups
    UNION ALL
    (
        SELECT
            title,
            mediainfo.Duration AS duration
        FROM $direct_index AS di
        LEFT SEMI JOIN $platinum_map_concat AS pt
            USING (title)
        WHERE $getHost("https://" || GroupingUrl) == "youtube.com"
    )
);

-- Titles that occur at least 1000 times across both text-duplicate inputs;
-- they are excluded from the per-title duration reduction further down.
-- NOTE(review): both inputs contain the rows of $vh_dups, so every VH row is
-- counted twice towards the threshold -- confirm this is intended.
$bad_titles = (
    SELECT
        title
    FROM (
        SELECT
            title,
            COUNT(*) AS `count`
        FROM (
            SELECT title FROM $for_yt_text_dups_non_pt
            UNION ALL
            SELECT title FROM $for_yt_text_dups_platinum
        )
        GROUP BY
            title
    )
    WHERE `count` >= 1000
);

-- Python UDF: given all durations seen for one title, returns the sorted list of
-- distinct durations that have a duplicate, i.e. a duration x is returned when
--   * x itself occurs more than once, or
--   * some other distinct duration y lies within [0.9 * x, 1.1 * x].
-- Values that are not Python ints (e.g. NULL durations) are ignored.
--
-- Fix: the previous predicate was `any(y for y in ... if cond)`, which tests the
-- truthiness of y instead of the condition, so a duration of 0 that occurred
-- several times was never reported. The condition itself is now what is tested.
$dups_durations = Python::dups_durations(
    Callable<(List<Int64?>)->List<Int64?>>, @@
from collections import Counter

def dups_durations(lst):
    counts = Counter(x for x in lst if isinstance(x, int))
    return sorted(
        x for x in counts
        if counts[x] > 1
        or any(y != x and x * 0.9 <= y <= x * 1.1 for y in counts)
    )

@@
);

-- Per title: the durations that have a duplicate among VH documents only.
$vh_dups_reduce = (
    SELECT
        title,
        $dups_durations(AGGREGATE_LIST(duration)) AS dups_durations
    FROM $vh_dups
    GROUP BY
        title
);

-- Per title: durations that have a duplicate among VH + non-platinum YouTube
-- documents. Titles listed in $bad_titles are skipped (LEFT ONLY JOIN).
$for_yt_text_dups_reduce_non_pt = (
    SELECT
        npt.title AS title,
        $dups_durations(AGGREGATE_LIST(duration)) AS dups_durations
    FROM $for_yt_text_dups_non_pt AS npt
    LEFT ONLY JOIN $bad_titles AS bt
        USING (title)
    GROUP BY
        npt.title
);

-- Per title: durations that have a duplicate among VH + platinum-titled YouTube
-- documents. Titles listed in $bad_titles are skipped (LEFT ONLY JOIN).
$for_yt_text_dups_reduce_platinum = (
    SELECT
        pt.title AS title,
        $dups_durations(AGGREGATE_LIST(duration)) AS dups_durations
    FROM $for_yt_text_dups_platinum AS pt
    LEFT ONLY JOIN $bad_titles AS bt
        USING (title)
    GROUP BY
        pt.title
);

-- One row per (vh_category, channel_id) for channels that own at least one
-- document with a title-match YouTube duplicate that is not itself sourced
-- from YouTube. SOME() picks an arbitrary example value within the channel.
-- The VH channel link depends on the category: a partner statistics url for
-- "CMS", a support url built from the owner id and channel id otherwise.
$dups_channels = (
    SELECT
        vh_category,
        channel_id,
        SOME(yt_author_id) AS yt_author_id,
        SOME(dups_pair) AS dups_pair,
        SOME(ugc_owner_id) AS ugc_owner_id,
        "https://youtube.com/channel/" || (SOME(yt_author_id) ?? "") AS yt_channel_url,
        IF(
            vh_category == "CMS",
            "https://partner.vh.yandex.ru/statistics/channel?channelId=" || (channel_id ?? ""),
            "https://vh.yandex.ru/support/"
                || (CAST(SOME(ugc_owner_id) AS String) ?? "")
                || "/channel/"
                || (channel_id ?? "")
        ) AS vh_channel_url
    FROM $add_data
    WHERE
        has_yt_dup
        AND NOT has_yt_source
        AND vh_category IS NOT NULL
        AND channel_id IS NOT NULL
    GROUP BY
        vh_category,
        channel_id
);

-- Publishes the per-channel list, replacing the previous contents of the table.
INSERT INTO `//home/videolog/channels_with_yt_dups` WITH TRUNCATE
SELECT
    *
FROM $dups_channels;


-- Approximate number of channels with YouTube duplicates per VH category.
-- NOTE(review): $dups_channels already holds one row per (vh_category, channel_id)
-- with a non-NULL channel_id, so an exact COUNT(*) would be available here.
$dups_channels_grouped = (
    SELECT
        vh_category,
        CountDistinctEstimate(channel_id) AS channels_with_yt_dups
    FROM $dups_channels
    GROUP BY
        vh_category
);

-- $add_data_2 = (
--     select
--         di.*,
--         du.computed_channel is not null as is_yt_dup_2
--     from $add_data as di
--     left join $dups_channels as du using (vh_category, computed_channel)
-- );

-- Adds six duplicate flags to every $add_data row. A flag is set when the
-- document has a VH category, its title is present in the corresponding
-- per-title reduction, and its own duration is among that title's duplicate
-- durations. The "*_not_in_content_dups" variants additionally require that
-- the document is not a known content duplicate.
--   vhd  -> duplicates among VH documents only
--   ytd  -> duplicates among VH + non-platinum YouTube documents
--   ytdp -> duplicates among VH + platinum-titled YouTube documents
$add_data_3 = (
    SELECT
        (
            vh_category IS NOT NULL
            AND vhd.title IS NOT NULL
            AND duration IN vhd.dups_durations
        ) AS has_vh_dup,
        (
            vh_category IS NOT NULL
            AND vhd.title IS NOT NULL
            AND duration IN vhd.dups_durations
            AND NOT has_content_dup
        ) AS has_vh_dup_not_in_content_dups,
        (
            vh_category IS NOT NULL
            AND ytd.title IS NOT NULL
            AND duration IN ytd.dups_durations
        ) AS has_yt_text_dup,
        (
            vh_category IS NOT NULL
            AND ytd.title IS NOT NULL
            AND duration IN ytd.dups_durations
            AND NOT has_content_dup
        ) AS has_yt_text_dup_not_in_content_dups,
        (
            vh_category IS NOT NULL
            AND ytdp.title IS NOT NULL
            AND duration IN ytdp.dups_durations
        ) AS has_yt_text_dup_pt,
        (
            vh_category IS NOT NULL
            AND ytdp.title IS NOT NULL
            AND duration IN ytdp.dups_durations
            AND NOT has_content_dup
        ) AS has_yt_text_dup_not_in_content_dups_pt,
        di.*
    FROM $add_data AS di
    LEFT JOIN $vh_dups_reduce AS vhd
        ON (di.title == vhd.title)
    LEFT JOIN $for_yt_text_dups_reduce_non_pt AS ytd
        ON (di.title == ytd.title)
    LEFT JOIN $for_yt_text_dups_reduce_platinum AS ytdp
        ON (di.title == ytdp.title)
);

-- Single summary row for platinum Cyrillic-titled YouTube documents, reported
-- under the synthetic category "yt_platinum_rus"; counts default to 0.
$platinum_grouped = (
    SELECT
        "yt_platinum_rus" AS vh_category,
        (CountDistinctEstimate(GroupingUrl) ?? 0) AS documents,
        (CountDistinctEstimate(IF(last_24_hr, GroupingUrl)) ?? 0) AS documents_last_24_hr
    FROM $platinum_map_concat
);

-- Approximate distinct-document counts per VH category. Each IF(flag, GroupingUrl)
-- yields NULL for rows without the flag, so only flagged documents are counted.
$grouped = (
    SELECT
        vh_category,
        CountDistinctEstimate(GroupingUrl) AS documents,
        CountDistinctEstimate(IF(last_24_hr, GroupingUrl)) AS documents_last_24_hr,
        CountDistinctEstimate(IF(has_vh_dup, GroupingUrl)) AS documents_vh_dups,
        CountDistinctEstimate(
            IF(has_vh_dup_not_in_content_dups, GroupingUrl)
        ) AS documents_vh_dups_not_in_content_dups,
        CountDistinctEstimate(IF(has_yt_dup, GroupingUrl)) AS documents_yt_dups,
        CountDistinctEstimate(IF(has_yt_text_dup, GroupingUrl)) AS documents_yt_text_dups,
        CountDistinctEstimate(
            IF(has_yt_text_dup_not_in_content_dups, GroupingUrl)
        ) AS documents_yt_text_dups_not_in_content_dups,
        CountDistinctEstimate(IF(has_yt_text_dup_pt, GroupingUrl)) AS documents_yt_text_dups_pt,
        CountDistinctEstimate(
            IF(has_yt_text_dup_not_in_content_dups_pt, GroupingUrl)
        ) AS documents_yt_text_dups_not_in_content_dups_pt
    FROM $add_data_3
    WHERE vh_category IS NOT NULL
    GROUP BY
        vh_category
);

-- Per-category document counts extended with the per-category channel count.
$joined = (
    SELECT
        g.*,
        channels_with_yt_dups
    FROM $grouped AS g
    LEFT JOIN $dups_channels_grouped AS dc
        USING (vh_category)
);

-- VH category rows plus the platinum YouTube summary row.
$concat = (
    SELECT * FROM $joined
    UNION ALL
    SELECT * FROM $platinum_grouped
);

-- Final report rows: stamps the report date and replaces every missing counter
-- with 0. The raw counter columns are dropped from c.* (WITHOUT ...) so that only
-- their zero-filled versions remain next to the other columns.
$concat_add = (
    SELECT
        $fielddate AS fielddate,
        (documents ?? 0) AS documents,
        (documents_last_24_hr ?? 0) AS documents_last_24_hr,
        (documents_vh_dups ?? 0) AS documents_vh_dups,
        (documents_yt_dups ?? 0) AS documents_yt_dups,
        (channels_with_yt_dups ?? 0) AS channels_with_yt_dups,
        (documents_vh_dups_not_in_content_dups ?? 0) AS documents_vh_dups_not_in_content_dups,
        (documents_yt_text_dups ?? 0) AS documents_yt_text_dups,
        (documents_yt_text_dups_not_in_content_dups ?? 0) AS documents_yt_text_dups_not_in_content_dups,
        (documents_yt_text_dups_pt ?? 0) AS documents_yt_text_dups_pt,
        (documents_yt_text_dups_not_in_content_dups_pt ?? 0) AS documents_yt_text_dups_not_in_content_dups_pt,
        c.* WITHOUT
            documents,
            documents_last_24_hr,
            documents_vh_dups,
            documents_yt_dups,
            channels_with_yt_dups,
            documents_vh_dups_not_in_content_dups,
            documents_yt_text_dups,
            documents_yt_text_dups_not_in_content_dups,
            documents_yt_text_dups_pt,
            documents_yt_text_dups_not_in_content_dups_pt
    FROM $concat AS c
);

-- insert into `//home/videolog/mma-4930-test/2020-08-18` with truncate
-- select * from $concat_add;

-- Uploads the daily rows to the statistics report. fielddate and vh_category
-- are unwrapped, so the query fails if either of them is NULL in any row.
UPSERT INTO stat.`Video/Others/Strm/MMA-4930-index-dups/daily`
SELECT
    Unwrap(fielddate) AS fielddate,
    Unwrap(vh_category) AS vh_category,
    t.* WITHOUT
        fielddate,
        vh_category
FROM $concat_add AS t