-- Enables the ANSI-standard treatment of NULL for IN over empty or nullable
-- collections (see the YQL pragma reference).
PRAGMA AnsiInForEmptyOrNullableItemsCollections;

-- Script parameters. Both values are placeholders filled in from outside this
-- script (presumably by the launcher -- confirm the substitution mechanism).
-- $table_list: comma-separated list of input table paths; it is split on ","
--              and read through EACH() when $sources is built.
-- $base_folder: root path under which all export paths below are derived.
$table_list = ?;
$base_folder = ?;
-- Current Europe/Moscow time in seconds, truncated by integer division to a
-- whole multiple of 3600 (start of the current hour). Used as the table suffix.
$start = SELECT (DateTime::ToSeconds(CurrentTzDatetime("Europe/Moscow")) / 3600) * 3600;
-- Folder holding the exported hourly tables; it is both read (FOLDER/RANGE)
-- and written by this script.
$yacofast_folder = $base_folder || "/export/content_categories/yacofast_data";
-- Output table for this run: one table per hour inside $yacofast_folder.
$yacofast_table = $base_folder || "/export/content_categories/yacofast_data/tags_data_direct_" || CAST($start AS String);
-- Table of per-url_pattern category overrides (plus_categories / minus_categories).
$whitelist_table = $base_folder || "/export/content_categories/whitelist_categories";
-- Table with a `domain` column; hosts found here get no content categories exported.
$blocked_domains = $base_folder || "/export/content_categories/categories_blocked_domains";
-- Numeric category ids less than or equal to this value are treated as
-- brand-safety tags, larger ids as content categories (see $is_brandsafety
-- and $is_content_cats).
$brand_safety_max_id = 262144;

-- Input rows gathered from every table named in the comma-separated $table_list.
-- urlmd5 is normalised to String and a missing category list becomes an empty list.
$source_tables = String::SplitToList($table_list, ",");

$sources = (
    SELECT
        CAST(urlmd5 AS String) AS urlmd5,
        url,
        source,
        COALESCE(catalogia_cats, ListCreate(String)) AS catalogia_cats
    FROM EACH($source_tables)
);

-- Number of tables (other node types are ignored) currently present in the yacofast folder.
$yacofast_tables_count = (
    SELECT
        COUNT_IF(Type = "table")
    FROM FOLDER($yacofast_folder)
);

-- Build the @data temp table that feeds the rest of the script.
-- If the yacofast folder already contains tables, the fresh $sources rows are
-- combined with the previously exported rows for the same URLs. If the folder
-- is empty (first launch or some earlier error), only $sources is used;
-- presumably RANGE() over an empty folder would fail, hence the guard.
EVALUATE IF $yacofast_tables_count > 0 DO BEGIN
    INSERT INTO @data
    (SELECT * FROM $sources)
    UNION ALL
    -- Previously exported rows, restricted to URLs present in $sources and
    -- reshaped to the $sources column set (urlmd5, url, source, catalogia_cats).
    (SELECT
        urlmd5,
        GroupingUrl AS url,
        -- "direct" when the row carries at least one of the two tag columns, NULL otherwise.
        IF(BrandSafetyTagsOutstream IS NOT NULL OR ContentCategoriesOutstream IS NOT NULL, "direct", NULL) AS source,
        -- Undo the string serialisation of both tag columns and concatenate the ids:
        --   ContentCategoriesOutstream looks like ["id1","id2"]
        --   BrandSafetyTagsOutstream   looks like [{"id": "id1"}, {"id": "id2"}]
        -- The leading/trailing wrappers are stripped and the rest is split on the separator.
        ListExtend(String::SplitToList(String::ReplaceAll(String::ReplaceAll(ContentCategoriesOutstream, '[\"', ''), '\"]', ''), '\",\"'),
            String::SplitToList(String::ReplaceAll(String::ReplaceAll(BrandSafetyTagsOutstream, '[{\"id\": \"', ''), '\"}]', ''), '\"}, {\"id\": \"')) as catalogia_cats
    FROM RANGE($yacofast_folder)
    WHERE urlmd5 IN (SELECT urlmd5 FROM $sources))
END DO
ELSE DO BEGIN
    -- Empty folder: nothing to merge with, take the source rows as they are.
    INSERT INTO @data
    SELECT * FROM $sources
END DO;
-- Make @data visible to the statements that follow.
COMMIT;


-- Whitelist overrides per URL, built in four steps.

-- Step 1: split every URL of @data into [host, path segment 1, path segment 2, ...]
-- (empty path segments are skipped).
$data_url_parts = (
    SELECT
        urlmd5,
        url,
        ListExtend(
            AsList(Url::GetHost(url)),
            CAST(String::SplitToList(Url::GetPath(url), "/", True, True) AS List<Optional<String>>)
        ) AS url_parts
    FROM @data
);

-- Step 2: candidate patterns are the first 1..4 parts joined with "/":
-- host, host/a, host/a/b, host/a/b/c. Shorter URLs simply repeat their longest prefix.
$data_url_prefixes = (
    SELECT
        urlmd5,
        url,
        AsList(
            ListConcat(ListTake(url_parts, 1), "/"),
            ListConcat(ListTake(url_parts, 2), "/"),
            ListConcat(ListTake(url_parts, 3), "/"),
            ListConcat(ListTake(url_parts, 4), "/")
        ) AS url_patterns
    FROM $data_url_parts
);

-- Step 3: one row per (URL, whitelist entry) whose url_pattern equals one of the prefixes.
$whitelist_matches = (
    SELECT
        d.urlmd5 AS urlmd5,
        d.url AS url,
        a.url_pattern AS url_pattern,
        a.plus_categories AS plus_categories,
        a.minus_categories AS minus_categories
    FROM $data_url_prefixes AS d
    FLATTEN LIST BY url_patterns AS url_pattern
    JOIN $whitelist_table AS a
        ON a.url_pattern = d.url_pattern
);

-- Step 4: per URL keep the entry with the longest (most specific) pattern and
-- split its comma-separated category strings into lists, skipping empty items.
$whitelist = (
    SELECT
        urlmd5,
        url,
        String::SplitToList(MAX_BY(plus_categories, LENGTH(url_pattern)), ",", true, true) AS plus_categories,
        String::SplitToList(MAX_BY(minus_categories, LENGTH(url_pattern)), ",", true, true) AS minus_categories
    FROM $whitelist_matches
    GROUP BY CAST(urlmd5 AS String) AS urlmd5, url
);

-- Category ids are numeric strings. Ids up to $brand_safety_max_id are brand-safety
-- tags, larger ids are content categories. A value that does not cast to Int64
-- yields NULL and therefore passes neither filter.
$is_brandsafety = ($id) -> (CAST($id AS Int64) <= $brand_safety_max_id);
$is_content_cats = ($id) -> (CAST($id AS Int64) > $brand_safety_max_id);

-- Serialise the brand-safety ids of a list as [{"id": "a"}, {"id": "b"}].
$convert_bs = ($ids) -> {
    $kept = ListFilter($ids, $is_brandsafety);
    $joined = String::JoinFromList($kept, '\"}, {\"id\": \"');
    RETURN String::JoinFromList(AsList('[{\"id\": \"', $joined, '\"}]'), '');
};

-- Serialise the content-category ids of a list as ["a","b"].
$convert_cats = ($ids) -> {
    $kept = ListFilter($ids, $is_content_cats);
    $joined = String::JoinFromList($kept, '\",\"');
    RETURN String::JoinFromList(AsList('[\"', $joined, '\"]'), '');
};

-- Aggregation that merges per-row lists into one list of distinct items.
$aggregation = AggregateFlatten(AggregationFactory("AGGREGATE_LIST_DISTINCT"));

-- Final export: one row per (urlmd5, url) written to this hour's yacofast table.

-- Per @data row: (own categories + whitelist plus_categories) - whitelist minus_categories.
-- Missing lists on either side are treated as empty.
$export_row_cats = (
    SELECT
        d.urlmd5 AS urlmd5,
        d.url AS url,
        COALESCE(DictKeys(SetDifference(
            ToSet(ListExtend(IF(d.catalogia_cats IS NULL, [], d.catalogia_cats), IF(w.plus_categories IS NULL, [], w.plus_categories))),
            ToSet(IF(w.minus_categories IS NULL, [], w.minus_categories))
        )), []) AS catalogia_cats
    FROM @data AS d
    LEFT JOIN $whitelist AS w
        ON w.urlmd5 = d.urlmd5
);

-- Collapse duplicates: union of distinct categories over all rows of the same URL.
-- (The former per-group `source` aggregate was never read by the export and is dropped.)
$export_url_cats = (
    SELECT
        urlmd5,
        url,
        COALESCE(AGGREGATE_BY(catalogia_cats, $aggregation), []) AS catalogia_cats
    FROM $export_row_cats
    GROUP BY urlmd5, url
);

-- Blocked hosts, de-duplicated so that a domain listed more than once cannot
-- multiply exported rows through the LEFT JOIN below.
$export_blocked_hosts = (
    SELECT DISTINCT
        domain
    FROM $blocked_domains
);

INSERT INTO $yacofast_table WITH TRUNCATE
SELECT
    u.url AS GroupingUrl,
    -- Content categories are exported only when the URL has at least one and its host is not blocked.
    IF(ListHasItems(ListFilter(u.catalogia_cats, $is_content_cats)) AND b.domain IS NULL, $convert_cats(u.catalogia_cats), NULL) AS ContentCategoriesOutstream,
    -- Genres and brand-safety tags are always exported as NULL by this script.
    NULL AS ContentGenresOutstream,
    NULL AS BrandSafetyTagsOutstream,
    u.urlmd5 AS urlmd5
FROM $export_url_cats AS u
LEFT JOIN $export_blocked_hosts AS b
    ON Url::GetHost(u.url) = b.domain;
