-- https://a.yandex-team.ru/arcadia/products/analytics/charts/category_coverage

-- YT execution settings applied to the operations launched by this query.
-- Pool and pool tree the operations are submitted to.
pragma yt.Pool = "goods_quality";
pragma yt.PoolTrees = "physical";
-- Extra operation spec: reducer jobs request only 0.05 CPU each.
pragma yt.OperationSpec = "{reducer={cpu_limit=0.05}}";
-- Default per-job memory limit.
pragma yt.DefaultMemoryLimit = "5G";
-- Amount of input data targeted per job.
pragma yt.DataSizePerJob = "10G";

-- Run parameters.
-- The ${global.*} placeholders are not YQL; presumably they are substituted by
-- the launching workflow before the query is submitted (TODO confirm).
-- Comment toggle: this block is active because its opening marker is the line
-- comment "--/*". For a manual run change that marker to "/*" and change the
-- opener of the hard-coded block that follows to "--/*".
--/*
-- Day to process; used in the input table path and in the cache folder name.
$date = '${global.date}';
-- Root folder for the cache and the merged result tables.
$outputdir = '${global.outputdir_parallel}';
-- Re-parse user sessions even if the cached parsed table already exists.
$force_reparse_us = ${global.force_reparse_us};
-- Recompute the aggregation even if the cached aggregation table already exists.
$force_redo_aggregate = ${global.force_redo_aggregate};
-- Skip the whole pipeline when no cached parsed user sessions exist for $date.
$disable_if_no_parsed_us = ${global.disable_if_no_parsed_us};
-- Do not rebuild the sorted copy of the merged table.
$disable_form_merged_sorted = ${global.disable_form_merged_sorted};
--*/

/*
$date = '2022-07-01';
$outputdir = '//home/goods_quality/charts/category_coverage/parallel';
$force_reparse_us = False;
$force_redo_aggregate = False;
$disable_if_no_parsed_us = False;
$disable_form_merged_sorted = True;  -- avoid taking exclusive lock for parallel execution
--*/

/* ======================================================================== */

-- Daily user-sessions table that is parsed for $date.
$input = '//user_sessions/pub/search/daily/' || $date || '/columns/clean';
-- Per-date cache folder holding the intermediate tables.
$outputdir_cache = $outputdir || '/cache/' || $date;
-- Cached parsed sessions (written by $parse_us).
$output_cache_us = $outputdir_cache || '/user_session';
-- Cached one-row aggregation for $date (written by $aggregate_statistics).
$output_cache_aggr = $outputdir_cache || '/aggregation';
-- Table the per-date aggregation rows are appended to.
$output_merged = $outputdir || '/category_coverage_merged';
-- Copy of the merged table ordered by `timestamp`.
$output_merged_sorted = $outputdir || '/category_coverage_merged_sorted';

/* ======================================================================== */

/*
Based on:
Parsing web usersessions
  - https://yql.yandex-team.ru/Operations/YqW3rLq3k4fbxieGbQ6PwJyYTqX166HA-Q1mPN_8v6U=
  - https://yql.yandex-team.ru/Operations/YZpa5wB_AjbPFrTgzhNjdz7tTuSEO7VS4JQTdPyxewM=
  - https://yql.yandex-team.ru/Operations/Ymv0vbq3kwZYo4bDVsfMrfbNe_9ZEr94HxK4g0-LxIE=
https://wiki.yandex-team.ru/ecomquality/faq/sbor-pula-i-obuchenie-ranzhirovanija-vnutri-koldunshhika/
  - https://a.yandex-team.ru/svn/trunk/arcadia/market/analytics/market_search/parallel_clickpool
*/

-- All table paths below are resolved on the hahn cluster.
use hahn;

-- Allowed number of failed jobs per operation.
pragma yt.DefaultMaxJobFails = '1';
-- Schema inference setting for input tables (see the YQL yt.InferSchema pragma docs).
pragma yt.InferSchema = '10';
-- Non-strict Yson handling.
pragma yson.DisableStrict;
-- NOTE(review): presumably needed so that `select t.*` keeps plain column names — confirm.
pragma SimpleColumns;

-- https://yql.yandex-team.ru/docs/yt/interfaces/web?searchQuery=arc#attach
--pragma File("libra.py", "https://paste.yandex-team.ru/10357253/text");
-- Session-parsing Python script, pinned to a fixed Arcadia revision.
pragma File("libra.py", "arc://products/analytics/charts/category_coverage/lib/libra_parallel.py?rev=9680191");

-- Blockstat dictionary passed to the libra preprocessor.
pragma File("blockstat.dict", "yt://current/statbox/statbox-dict-last/blockstat.dict");
-- Native libra UDF library, attached and registered as a UDF module.
pragma File("libra.so", "yt://hahn/statbox/resources/libra_nile_udf2.7.so");
pragma UDF("libra.so");

-- Libra preprocessor applied to each (key, rows) group before the Python parser.
-- It is configured with the attached blockstat dictionary and a whitelist of
-- the entities listed below; ParseWithFat is turned off.
$preprocess = LibraCustomPython::MakeLibraPreprocessor(
    AsStruct(
        FilePath("blockstat.dict") AS BlockstatDict,
        AsList("Baobab", "Rearr", "Relev", "Web") AS Entities,
        "whitelist" AS EntitiesStrategy,
        false AS ParseWithFat
    )
);

-- Python UDF `parse_sessions` loaded from the attached libra.py.
-- The callable type below is the contract with that script: it takes a libra
-- events group resource and returns a stream of rows with exactly these
-- columns. Columns marked `?` are optional and may be NULL.
$parse_sessions = CustomPython::parse_sessions(
    Callable<(Resource<'LibraCustomPythonEventsGroup'>)->Stream<Struct<
        key:String,
        subkey:Uint64,

        uid:String,
        ts:Uint64,
        reqid:String,
        query:String,
        user_region:Int64,

        url:String?,

        -- classifiers (scores arrive as optional strings; cast to double downstream)
        wizdetection_ecom_classifier_prob:String?,
        wizdetection_cehac_ecom_classifier_prob:String?,
        wizdetection_fashion_ecom_classifier_prob:String?,
        wizdetection_home_ecom_classifier_prob:String?,
        wizdetection_diy_ecom_classifier_prob:String?,
        wizdetection_pharma_ecom_classifier_prob:String?,
        query_about_one_product:String?,
        query_about_many_products:String?,

        -- rules (a non-NULL marker is treated downstream as "covered")
        GoodsCategoryFilterWorkedRuleMarker:String?,
    >>>,
    FileContent("libra.py")
);

-- Reducer body: run the libra preprocessor over one key group, then hand the
-- resulting events group to the Python session parser.
$reducer = ($key, $rows) -> {
    $events_group = $preprocess($key, $rows);
    return $parse_sessions($events_group);
};


-- Action: parse the user-sessions table $input with $reducer (grouped by key,
-- rows within a group ordered by subkey) and overwrite $output with the result.
define action $parse_us($input, $output) as

    $parsed_sessions = (
        reduce $input
        presort subkey
        on key
        using $reducer(TableRow())
    );

    insert into $output with truncate
    select *
    from $parsed_sessions;

end define;

/* ======================================================================== */

-- Returns $a when it is strictly greater than $b, otherwise $b.
$max = ($a, $b) -> {
    $a_is_greater = $a > $b;
    return IF($a_is_greater, $a, $b);
};


-- Action: classify every parsed request in $input, count all requests and
-- covered requests per segment, derive coverage ratios, and overwrite $output
-- with the single resulting row, stamped with the global $date.
define action $aggregate_statistics($input, $output) as

-- Per-request base flags.
--   is_covered: the GoodsCategoryFilterWorkedRuleMarker rule marker is present.
--   is_ecom:    at least one of the three scores reaches its threshold.
-- The score columns are optional strings and cast() yields NULL for a missing
-- or unparsable value, so the OR is three-valued. It is coalesced to False so
-- that every request lands in exactly one of the ecom / nonecom buckets
-- (count_total = count_ecom + count_nonecom). Without the coalesce, requests
-- with a missing score and no passing score were counted in neither bucket.
$input_extra = (
select t.*,
  (GoodsCategoryFilterWorkedRuleMarker is not null) as is_covered,
  ((    cast(wizdetection_ecom_classifier_prob as double) >= 0.2
    or  cast(query_about_many_products as double) >= 0.1
    or  cast(query_about_one_product as double) >= 0.05
  ) ?? False) as is_ecom,
from $input as t
);

-- Category flags: a category counts only for ecom requests. A flag is NULL
-- when its score is missing and the request is ecom; count_if ignores NULLs.
$input_classifiers = (
select t.*,
  ((CAST(wizdetection_cehac_ecom_classifier_prob as double)   > 0.44) and is_ecom) as is_cehac,
  ((CAST(wizdetection_diy_ecom_classifier_prob as double)     > 0.40) and is_ecom) as is_diy,
  ((CAST(wizdetection_fashion_ecom_classifier_prob as double) > 0.40) and is_ecom) as is_fashion,
  ((CAST(wizdetection_home_ecom_classifier_prob as double)    > 0.30) and is_ecom) as is_home,
  ((CAST(wizdetection_pharma_ecom_classifier_prob as double)  > 0.58) and is_ecom) as is_pharma,
from $input_extra as t
);

-- "Other ecom": ecom requests that matched none of the category flags
-- (a NULL category flag is treated as not matched).
$input_classifiers_other = (
select t.*,
  (not(   (is_cehac   ?? False)
       or (is_diy     ?? False)
       or (is_fashion ?? False)
       or (is_home    ?? False)
       or (is_pharma  ?? False))
   and is_ecom
  ) as is_ecom_other
from $input_classifiers as t
);

-- Request counts per segment, and the same counts restricted to covered requests.
$input_counts = (
select
  count(*)                 as count_total,
  count_if(not is_ecom)    as count_nonecom,
  count_if(is_ecom)        as count_ecom,
  count_if(is_cehac)       as count_cehac,
  count_if(is_diy)         as count_diy,
  count_if(is_fashion)     as count_fashion,
  count_if(is_home)        as count_home,
  count_if(is_pharma)      as count_pharma,
  count_if(is_ecom_other)  as count_otherecom,

  count_if(is_covered)                    as count_covered_total,
  count_if(is_covered and not is_ecom)    as count_covered_nonecom,
  count_if(is_covered and is_ecom)        as count_covered_ecom,
  count_if(is_covered and is_cehac)       as count_covered_cehac,
  count_if(is_covered and is_diy)         as count_covered_diy,
  count_if(is_covered and is_fashion)     as count_covered_fashion,
  count_if(is_covered and is_home)        as count_covered_home,
  count_if(is_covered and is_pharma)      as count_covered_pharma,
  count_if(is_covered and is_ecom_other)  as count_covered_otherecom,
from $input_classifiers_other
);

-- Coverage = covered / all within a segment. The denominator is clamped to at
-- least 1 so an empty segment yields 0 instead of a division by zero.
$input_coverage = (
select t.*,
  (CAST(count_covered_total      as double) / $max(count_total, 1))      as coverage_total,
  (CAST(count_covered_nonecom    as double) / $max(count_nonecom, 1))    as coverage_nonecom,
  (CAST(count_covered_ecom       as double) / $max(count_ecom, 1))       as coverage_ecom,
  (CAST(count_covered_cehac      as double) / $max(count_cehac, 1))      as coverage_cehac,
  (CAST(count_covered_diy        as double) / $max(count_diy, 1))        as coverage_diy,
  (CAST(count_covered_fashion    as double) / $max(count_fashion, 1))    as coverage_fashion,
  (CAST(count_covered_home       as double) / $max(count_home, 1))       as coverage_home,
  (CAST(count_covered_pharma     as double) / $max(count_pharma, 1))     as coverage_pharma,
  (CAST(count_covered_otherecom  as double) / $max(count_otherecom, 1))  as coverage_otherecom,
from $input_counts as t
);

-- The row is stamped with the end of the processed day.
insert into $output with truncate
select t.*, ($date || "T23:59:59") as `timestamp`
from $input_coverage as t;

end define;

/* ======================================================================== */

-- Subquery: returns one row with the column is_existing_path, which tells
-- whether any path listed by FOLDER($folder) ends with $path. A leading '//'
-- is stripped from $path before the comparison.
define subquery $is_existing_path($path, $folder) as
    $relative_path = IF(
        String::StartsWith($path, '//'),
        substring($path, 2, length($path)),
        $path
    );
    $folder_paths = (
        select AGGREGATE_LIST(Path)
        from FOLDER($folder)
    );
    $matches_path = ($candidate) -> {
        return String::EndsWith($candidate, Unwrap($relative_path));
    };
    select ListAny(ListMap($folder_paths, $matches_path)) as is_existing_path;
end define;

/* ======================================================================== */

-- Pipeline driver: parse sessions -> aggregate -> append to merged -> optionally rebuild sorted copy.

-- Does the cached parsed-sessions table for $date already exist?
$output_cache_us_exists = select is_existing_path from $is_existing_path($output_cache_us, $outputdir_cache);
-- Run everything unless the caller asked to skip dates with no cached parsed sessions.
evaluate if (not $disable_if_no_parsed_us or $output_cache_us_exists) do begin

-- Step 1: parse user sessions when forced or when the cache is missing.
evaluate if ($force_reparse_us or not $output_cache_us_exists)
  do $parse_us($input, $output_cache_us);
commit;

-- Step 2: aggregate when forced or when the cached aggregation is missing.
-- NOTE(review): a forced reparse alone does not trigger re-aggregation when the
-- aggregation table already exists — confirm this is intended.
$output_cache_aggr_exists = select is_existing_path from $is_existing_path($output_cache_aggr, $outputdir_cache);
evaluate if ($force_redo_aggregate or not $output_cache_aggr_exists)
  do $aggregate_statistics($output_cache_us, $output_cache_aggr);
commit;

-- Step 3: add the row for $date to the merged table. Unlike the cache writes
-- there is no `with truncate` here.
-- NOTE(review): re-running the same date presumably adds a duplicate row — confirm.
insert into $output_merged
select * from $output_cache_aggr;
commit;

-- Step 4: rebuild the sorted copy of the merged table unless disabled.
evaluate if (not $disable_form_merged_sorted) do begin
  insert into $output_merged_sorted with truncate
  select * from $output_merged
  order by `timestamp` asc
end do;

end do;
