use arnold;
PRAGMA yt.InferSchema = '1';

-- Base directory for the input tables read by this script.
$root_path = 'home/robot-quality/insignificant_cgis';
-- Folder listed below to discover candidate tables named `webmaster_cgis.<suffix>`.
$cgi_candidate_path = $root_path || '/webmaster_cgis';
-- Folder holding the matching `webmaster_sample.<suffix>` tables.
$cgi_samples_path = $root_path || '/webmaster_sample';
-- Predicate used to keep only CGI names that consist solely of ASCII letters,
-- digits, '-', '_', '[' and ']' (Re2::Match tests the whole string).
$goodCGI = Re2::Match('[-_a-zA-Z\\[\\]0-9]{1,}');

-- Suffix (second '.'-separated component of the table name) of the candidate
-- table that sorts last lexicographically; the `webmaster_cgis.latest` entry
-- is excluded from the listing.
$last_exists_table = (
    SELECT
        String::SplitToList(TableName(Path), ".")[1] AS table_suffix
    FROM FOLDER($cgi_candidate_path, "schema;row_count")
    WHERE TableName(Path) != 'webmaster_cgis.latest'
    ORDER BY table_suffix DESC
    LIMIT 1
);

-- `value` of the most recent INSIGNIFICANT_CGI_PARAMETERS_PROCESSING record in
-- the shared state table (the record with the greatest `timestamp`).
$last_processed_table = (
    SELECT value
    FROM (
        SELECT
            value,
            `timestamp`
        FROM `//home/webmaster/prod/service/common_data_state`
        WHERE type = 'INSIGNIFICANT_CGI_PARAMETERS_PROCESSING'
        ORDER BY `timestamp` DESC
        LIMIT 1
    ) AS latest_state
);


-- Directory that receives one result table per processed suffix.
$output_path = "home/webmaster/prod/checklist/cgi";

-- Input tables for the current run, both keyed by the newest candidate suffix.
$cgi_candidate = $cgi_candidate_path || "/webmaster_cgis." || $last_exists_table;
$cgi_samples = $cgi_samples_path || "/webmaster_sample." || $last_exists_table;

-- Result table for this run, and the result table named after the last
-- recorded state value (used for comparison in $query).
$output_table = $output_path || "/" || $last_exists_table;
$last_output_table = $output_path || "/" || $last_processed_table;
    

-- One row per (candidate CGI, sample path): samples are matched to candidates
-- on Host + CGI, restricted to hosts present in the verified-hosts export,
-- to samples with a 2xx HttpCode, and to CGI names accepted by $goodCGI.
$cgi_example = (
    SELECT
        cgi.Host AS Host,
        cgi.CGI AS CGI,
        cgi.`Rank` AS `Rank`,
        samples.PathForValidation AS samples
    FROM $cgi_samples
        WITH SCHEMA Struct<Host:String,CGI:String,PathForValidation:String,HttpCode:Uint32> AS samples
    INNER JOIN $cgi_candidate
        WITH SCHEMA Struct<Host:String,CGI:String,`Rank`:Double> AS cgi
        ON samples.Host = cgi.Host AND samples.CGI = cgi.CGI
    INNER JOIN `//home/webmaster/prod/export/archive/webmaster-verified-hosts-latest` AS verified_hosts
        ON verified_hosts.host_url = samples.Host
    WHERE samples.HttpCode BETWEEN 200 AND 299
        AND $goodCGI(cgi.CGI)
);
-- Collapse the per-sample rows into one row per (Host, CGI, Rank), keeping
-- up to 5 distinct sample paths for each group.
$grouped_samples = (
    SELECT
        Host,
        CGI,
        `Rank`,
        AGGREGATE_LIST_DISTINCT(samples, 5) AS samples
    FROM $cgi_example
    GROUP BY
        Host,
        CGI,
        `Rank`
);

-- Change statistics between the previously published table and the new
-- grouped result, matched on Host + CGI:
--   total_new     - pairs present only in the new result,
--   total_removed - pairs present only in the previous table,
--   total         - all rows of the full join.
$query = (
    SELECT
        SUM(count_new) AS total_new,
        SUM(count_removed) AS total_removed,
        SUM(cnt) AS total
    FROM (
        SELECT
            CASE WHEN old.Host IS NULL THEN 1 ELSE 0 END AS count_new,
            CASE WHEN new.Host IS NULL THEN 1 ELSE 0 END AS count_removed,
            1 AS cnt
        FROM $last_output_table AS old
        FULL JOIN $grouped_samples AS new
            ON new.Host = old.Host AND old.CGI = new.CGI
    )
);


-- Action: publish the grouped CGI samples into the result table for the
-- newest candidate suffix, then record that suffix in the shared state table
-- so that later runs skip it.
DEFINE ACTION $process_table() AS

    -- Disabled sanity check on the share of added/removed pairs (see $query).
    -- NOTE(review): the condition joins the two limits with `or`, so it passes
    -- when either one is small; `and` looks intended - confirm before enabling.
    --DISCARD select ENSURE(True, cast(total_new as Double)/total < 0.15 or cast(total_removed as Double)/total < 0.15,"Changes is too big") from $query;
    COMMIT;

    -- WITH TRUNCATE makes the write idempotent: if an earlier run wrote this
    -- table but failed before the state record was stored, the rerun replaces
    -- the contents instead of appending a second copy of the sorted rows.
    INSERT INTO $output_table WITH TRUNCATE
    SELECT
        Host,
        CGI,
        samples
    FROM $grouped_samples
    ORDER BY
        Host,
        CGI;
    COMMIT;

    -- Store the processed suffix only after the result table is committed.
    INSERT INTO `//home/webmaster/prod/service/common_data_state`
    SELECT
        "INSIGNIFICANT_CGI_PARAMETERS_PROCESSING" AS type,
        CurrentUtcDateTime() AS `timestamp`,
        COALESCE(CAST($last_exists_table AS String), "") AS value;

    COMMIT;

END DEFINE;

-- Run the action only when a candidate table actually exists and it is newer
-- than the last recorded one (or nothing has been recorded yet). The explicit
-- IS NOT NULL guard keeps the action from being invoked with NULL table paths
-- when the candidate folder yields no table.
EVALUATE IF $last_exists_table IS NOT NULL
    AND ($last_processed_table IS NULL OR $last_exists_table > $last_processed_table)
DO $process_table();

