-- Source and destination table paths. Both are positional placeholders.
-- NOTE(review): presumably the launching tool substitutes them and uses the trailing
-- test_value comments as sample paths for test runs - confirm before editing these lines.
$src_table = ?;--test_value="//tmp/darkkeks/sitelinks/import_from_bno_modified";
$dst_table = ?;--test_value="//home/shinyserp/qdsaas/sources/bno:docid_setprops";

-- Single-character markers used below to cut a url into parts ($domain)
-- and to strip a dangling trailing marker ($format).
$path_separator = '/';
$query_separator = '?';
$fragment_separator = '#';

-- Returns the part of $value that precedes the first occurrence of $separator.
-- FIND yields NULL when the separator is absent; a NULL length makes SUBSTRING
-- run to the end of the string, so such a value comes back unchanged.
$cut_before_first_occurence = ($value, $separator) -> {
    $separator_position = FIND($value, $separator);
    RETURN SUBSTRING($value, 0, $separator_position);
};

-- Returns the leading part of $url up to the first fragment, query or path separator,
-- whichever comes first. The url is expected to carry no scheme: the first path
-- separator inside a scheme prefix would otherwise end the result right there.
$domain = ($url) -> {
    $without_fragment = $cut_before_first_occurence($url, $fragment_separator);
    $without_query = $cut_before_first_occurence($without_fragment, $query_separator);
    RETURN $cut_before_first_occurence($without_query, $path_separator);
};

-- Returns $value without its leading $prefix; a value that does not start with
-- $prefix is returned as is.
$remove_prefix = ($value, $prefix) -> {
    $has_prefix = String::StartsWith($value, $prefix);
    $tail = SUBSTRING($value, LENGTH($prefix));
    RETURN IF($has_prefix, $tail, $value);
};

-- Returns $value without its trailing $suffix; a value that does not end with
-- $suffix is returned as is.
$remove_suffix = ($value, $suffix) -> {
    $has_suffix = String::EndsWith($value, $suffix);
    $kept_length = LENGTH($value) - LENGTH($suffix);
    RETURN IF($has_suffix, SUBSTRING($value, 0, $kept_length), $value);
};

-- Normalizes a url so that "identical" urls collapse into one key: lowercases it,
-- strips an optional scheme and "www." prefix, then strips at most one dangling
-- trailing fragment, query and path separator, in that order.
$format = ($url) -> {
    $lowered = String::ToLower($url);
    $without_https = $remove_prefix($lowered, "https://");
    $without_scheme = $remove_prefix($without_https, "http://");
    $without_www = $remove_prefix($without_scheme, "www.");
    $without_fragment_mark = $remove_suffix($without_www, $fragment_separator);
    $without_query_mark = $remove_suffix($without_fragment_mark, $query_separator);
    RETURN $remove_suffix($without_query_mark, $path_separator);
};

-- Merges the sitelink items of several snippets into one JSON-serialized list.
-- $snippets is a list of strings; the first LENGTH("Snippet=") bytes of each are dropped
-- (presumably the "Snippet=" key prefix - the prefix itself is not verified) and the
-- rest is parsed as JSON.
$parse_sitelinks = ($snippets) -> {
    -- Pulls the item rows out of one snippet as a list of string lists.
    $extract_rows = ($snippet) -> {
        $payload = SUBSTRING($snippet, LENGTH("Snippet="));
        $json = Yson::ParseJson($payload);
        $data = Yson::YPath($json, "//bno/features/data");
        RETURN Yson::ConvertTo($data, List<List<String>>);
    };

    -- Swaps the first two fields of an item and keeps the third one in place.
    $reorder = ($item) -> {
        RETURN AsList($item[1], $item[0], $item[2]);
    };

    $rows = ListFlatten(ListMap($snippets, $extract_rows));

    -- $items = [[title, url, description], ...]; duplicates removed, then sorted.
    $items = ListSort(ListUniq(ListMap($rows, $reorder)));
    RETURN Yson::SerializeJson(Yson::From($items));
};

-- One row per normalized url of $src_table: the url itself, its domain and the
-- merged, serialized sitelinks of every source row that normalizes to that url.
$sitelinks_by_url = (
    SELECT
        url,
        -- The domain is cut from the already normalized url (scheme and "www." removed).
        $domain(url) AS domain,
        -- Distinct Data_TSKV values of the group are parsed and merged into one list.
        $parse_sitelinks(AGGREGATE_LIST_DISTINCT(Data_TSKV)) as sitelinks
    FROM $src_table
    -- The grouping key is the normalized Subkey_Url, exposed as url.
    GROUP BY $format(Subkey_Url) as url
);

-- Rewrites $dst_table with one row per (domain, url): the sitelinks of that url and
-- the number of other urls of the same domain that start with it.
INSERT INTO $dst_table WITH TRUNCATE
SELECT
    url,
    domain,
    -- $sitelinks_by_url has a single row per url, so every joined row of a group
    -- carries the same parent.sitelinks and SOME simply picks it.
    SOME(parent.sitelinks) AS sitelinks,
    -- A non-NULL url always matches itself as its own prefix, hence the minus one.
    COUNT(*) - 1 AS subPages
FROM $sitelinks_by_url AS parent
JOIN $sitelinks_by_url AS child
    ON parent.domain = child.domain
WHERE
    -- COALESCE substitutes an empty prefix for a NULL parent.url.
    -- NOTE(review): the prefix match is purely textual, so "a.com/ab" is counted as a
    -- sub-page of "a.com/a" - confirm this is intended.
    String::StartsWith(child.url, COALESCE(parent.url, ""))
-- Group by more than just url so that the resulting table gets unique_keys=true.
GROUP BY parent.domain as domain, parent.url AS url
ORDER BY domain, url
