-- Cluster used for every table path referenced in this script.
USE hahn;
-- Raises the YT row weight limit to 32M for this script.
PRAGMA yt.MaxRowWeight="32M";

-- Formats a date as YYYY-MM-DD; used below to name the per-day delta table.
$to_iso_date = DateTime::Format("%Y-%m-%d");

-- Source table (read for Host, RobotsResponseBody, RobotsHTTPCode, LastAccess).
$kwyt_input = "//home/kwyt/hosts/robots";

-- Tables maintained by this script:
--   last-state : one row per host with the most recent robots.txt seen so far
--   snapshot   : accumulated history of detected changes (appended to on each run)
--   <date>     : changes detected by the run of the current UTC date
$last_state_path = "//home/webmaster/prod/kwyt/robots/updates/last-state";
$snapshot_path = "//home/webmaster/prod/kwyt/robots/updates/snapshot";
$output_path = "//home/webmaster/prod/kwyt/robots/updates/" || $to_iso_date(CurrentUtcDate());

-- robots.txt fetches that are worth processing in this run:
--   * the host is present in the webmaster-hosts export (semi join),
--   * the fetch returned HTTP code 200,
--   * the fetch is strictly newer than the LastAccess stored in the last state
--     (a host without a last-state row is compared against 0).
-- Content is the response body cast to Utf8 and cut to the first 40960 characters.
$filtered_kwyt_robots = (
    SELECT
        k.Host AS Host,
        Unicode::Substring(CAST(k.RobotsResponseBody AS Utf8), 0, 40960) AS Content,
        k.LastAccess AS LastAccess
    FROM $kwyt_input AS k
    LEFT SEMI JOIN `//home/webmaster/prod/export/webmaster-hosts` AS w
        ON k.Host == w.Host
    LEFT JOIN $last_state_path AS ls
        ON k.Host == ls.Host
    WHERE k.RobotsHTTPCode == 200
        AND k.LastAccess > COALESCE(ls.LastAccess, 0)
);

-- Newly accepted fetches stacked on top of the previously stored last state.
-- UNION ALL keeps every row from both inputs; nothing is deduplicated here.
$all_robotstxt = (
    SELECT
        Host,
        Content,
        LastAccess
    FROM $filtered_kwyt_robots

    UNION ALL

    SELECT
        Host,
        Content,
        LastAccess
    FROM $last_state_path
);

-- Every robots.txt row paired with the Content of the preceding row of the same
-- host, where "preceding" is defined by ascending LastAccess.
$robots_with_prev = (
    SELECT
        Host,
        LastAccess,
        Content,
        LAG(Content) OVER host_history AS PrevContent
    FROM $all_robotstxt
    WINDOW host_history AS (
        PARTITION BY Host
        ORDER BY LastAccess
    )
);

-- Rows whose Content differs from the previous Content of the same host.
-- The earliest row of each host has PrevContent = NULL, so the comparison is
-- not true and that row is dropped.
-- NOTE(review): this also drops the very first fetch of a host that has no
-- last-state row yet, and any pair where Content or PrevContent is NULL --
-- confirm that this is the intended behaviour.
$diff = (
    SELECT
        Host,
        LastAccess,
        Content
    FROM $robots_with_prev
    WHERE Content != PrevContent
);

-- Next last state: a single row per host carrying the greatest LastAccess seen
-- and the Content taken from the row with that greatest LastAccess.
$new_last_state = (
    SELECT
        Host,
        MAX(LastAccess) AS LastAccess,
        MAX_BY(Content, LastAccess) AS Content
    FROM $all_robotstxt
    GROUP BY Host
);

-- Delta of this run: replaces the contents of the table named after the
-- current UTC date with the detected changes.
INSERT INTO $output_path WITH TRUNCATE
SELECT
    Host,
    LastAccess,
    Content
FROM $diff
ORDER BY Host, LastAccess;

-- New last state: fully replaces the previous last-state table.
INSERT INTO $last_state_path WITH TRUNCATE
SELECT
    Host,
    LastAccess,
    Content
FROM $new_last_state
ORDER BY Host, LastAccess;

-- Snapshot update: the same detected changes are appended (no TRUNCATE) to the
-- accumulated snapshot table.
INSERT INTO $snapshot_path
SELECT
    Host,
    LastAccess,
    Content
FROM $diff
ORDER BY Host, LastAccess;
