-- Coverage report for one (gazetteer, dictionary, basket) evaluation run.
-- Reads the `inference` table under $evaldir and writes the `coverage` and
-- `coverage_counts` tables into the same directory.
USE hahn;
-- NOTE(review): hard-coded pool name; presumably the YT pool used for the operations — confirm.
PRAGMA yt.StaticPool = "goods_quality";
-- NOTE(review): presumably infers the schema of schemaless input tables from their first 10 rows — confirm against the YQL docs.
PRAGMA yt.InferSchema = '10';
-- NOTE(review): presumably makes `a.*` / `b.*` join outputs use bare column names;
-- the subqueries below reference joined columns without a table prefix.
PRAGMA SimpleColumns;

/* ================================================================================ */

-- Gazetteer version; selects the directory under .../collect_gzt/gzt/.
$gzt = 'v2_0_0317_0430';

-- Dictionary variant being evaluated. The commented-out lines are alternative
-- values kept for switching by hand.
--$dict = 'confidence70_lcabest';
--$dict = 'confidence70_lcabest_evalremovestopwords';
$dict = 'confidence70_lcabest_nostopwords';

-- Query basket name. The commented-out lines are alternative values kept for
-- switching by hand.
--$basket = '2022-05-02_2022-05-08';
--$basket = '2022-07-06_2022-07-06_emptyqueries';
--$basket = '2022-07-06_2022-07-06';
$basket = '2022-06-27_2022-07-03';

-- Evaluation directory: <gztdir>/eval/<basket>/<dict>/ (note the trailing slash,
-- table names are appended to it directly).
$gztdir = '//home/goods_quality/topilskiy-ak/catfilter/collect_gzt/gzt/' || $gzt;
$evaldir = $gztdir || '/eval/' || $basket || '/' || $dict || '/';

-- Basket table; it is joined by `query` below and supplies the classifier scores.
$basketdir = '//home/goods_quality/topilskiy-ak/catfilter/collect_gzt/basket/';
$classifiers = $basketdir || $basket;

-- Input table with per-query inference results; outputs go to the same directory.
$input_coverage = $evaldir || 'inference';
$outputdir = $evaldir;

/* ================================================================================ */

-- Output tables: aggregated coverage ratios and per-slice counters.
$output_coverage = $outputdir || 'coverage';
$output = $outputdir || 'coverage_counts';

-- Per-query inference results read from $input_coverage.
-- WeakField is called with a type and a default as its 2nd and 3rd arguments:
--   cnt      -> Uint64, default 1
--   hid      -> Uint64, default 0
--   glfilter -> String, default ""
-- Every computed column is aliased explicitly so that the later references to
-- a.cnt, hid and glfilter do not depend on implicit result-column naming.
$input = (
    SELECT
        query,
        WeakField(cnt, Uint64, 1)       AS cnt,
        WeakField(hid, Uint64, 0)       AS hid,
        WeakField(glfilter, String, "") AS glfilter
    FROM $input_coverage
);

-- Left-joins every inference row with the classifier basket on the query text
-- and derives is_ecom: true when at least one of the three classifier scores
-- reaches its threshold (0.2 / 0.1 / 0.05).
-- The WITHOUT list removes b.query and a.cnt from the output.
-- NOTE(review): with a.cnt removed, the `cnt` used downstream presumably comes
-- from $classifiers (and would be NULL for queries missing there) — confirm.
$input_extra = (
    SELECT
        (
            CAST(wizdetection_ecom_classifier_prob AS Double) >= 0.2
            OR CAST(query_about_many_products AS Double) >= 0.1
            OR CAST(query_about_one_product AS Double) >= 0.05
        ) AS is_ecom,
        a.*,
        b.*
    WITHOUT
        b.query,
        a.cnt
    FROM $input AS a
    LEFT JOIN $classifiers AS b
        ON a.query = b.query
);

-- Adds one flag per vertical. A row gets a vertical flag when it is e-commerce
-- (is_ecom) and the vertical's classifier score is strictly above its threshold.
$input_classifiers = (
    SELECT
        t.*,
        (is_ecom AND (CAST(wizdetection_cehac_ecom_classifier_prob   AS Double) > 0.44)) AS is_cehac,
        (is_ecom AND (CAST(wizdetection_diy_ecom_classifier_prob     AS Double) > 0.40)) AS is_diy,
        (is_ecom AND (CAST(wizdetection_fashion_ecom_classifier_prob AS Double) > 0.40)) AS is_fashion,
        (is_ecom AND (CAST(wizdetection_home_ecom_classifier_prob    AS Double) > 0.30)) AS is_home,
        (is_ecom AND (CAST(wizdetection_pharma_ecom_classifier_prob  AS Double) > 0.58)) AS is_pharma
    FROM $input_extra AS t
);

-- Adds is_other: an e-commerce row that carries none of the five vertical flags.
-- A NULL vertical flag is treated as "not in that vertical".
$input_classifiers_other = (
    SELECT
        t.*,
        (
            NOT COALESCE(is_cehac,   False)
            AND NOT COALESCE(is_diy,     False)
            AND NOT COALESCE(is_fashion, False)
            AND NOT COALESCE(is_home,    False)
            AND NOT COALESCE(is_pharma,  False)
            AND is_ecom
        ) AS is_other
    FROM $input_classifiers AS t
);

-- Category dictionary reduced to what the report needs:
--   hid     - the category's hyper_id
--   is_leaf - true when the `children` Yson value has length 0
--   depth   - length of the `hierarchy_hyper_ids` Yson value
$hid_info = (
    SELECT
        hyper_id                              AS hid,
        (Yson::GetLength(children) == 0)      AS is_leaf,
        (Yson::GetLength(hierarchy_hyper_ids)) AS depth
    FROM `//home/market/production/mstat/dictionaries/categories/latest`
);

-- Attaches category info to every row by hid. Rows whose hid has no match in
-- $hid_info fall back to is_leaf = False and depth = 1.
$input_classifiers_other_whidinfo = (
    SELECT
        a.*,
        COALESCE(b.is_leaf, False) AS is_leaf,
        COALESCE(b.depth, 1)       AS depth
    FROM $input_classifiers_other AS a
    LEFT JOIN $hid_info AS b
        ON a.hid = b.hid
);


-- Shorter name for the fully enriched row set; the counter inserts below read it.
$input_joined = $input_classifiers_other_whidinfo;

/* ---------------------------- */

-- Returns the larger of the two arguments; the second one is returned when the
-- first is not strictly greater. Used below to keep ratio denominators >= 1.
$max = ($lhs, $rhs) -> (
    CASE
        WHEN $lhs > $rhs THEN $lhs
        ELSE $rhs
    END
);

-- Row set for the coverage ratios. Adds:
--   is_ecom_other - copy of is_other
--   is_covered    - the row has a non-zero hid or a non-empty glfilter
$input_classifiers_other_whidinfo_for_coverage = (
    SELECT
        a.*,
        is_other                       AS is_ecom_other,
        (hid <> 0 OR glfilter <> "")   AS is_covered
    FROM $input_classifiers_other_whidinfo AS a
);

-- Single-row summary of cnt-weighted totals per segment, overall and restricted
-- to covered rows. A sum with no contributing rows is reported as 0, not NULL.
$input_counts = (
    SELECT
        -- all rows
        COALESCE(SUM(cnt),                   0) AS count_total,
        COALESCE(SUM_IF(cnt, NOT is_ecom),   0) AS count_nonecom,
        COALESCE(SUM_IF(cnt, is_ecom),       0) AS count_ecom,
        COALESCE(SUM_IF(cnt, is_cehac),      0) AS count_cehac,
        COALESCE(SUM_IF(cnt, is_diy),        0) AS count_diy,
        COALESCE(SUM_IF(cnt, is_fashion),    0) AS count_fashion,
        COALESCE(SUM_IF(cnt, is_home),       0) AS count_home,
        COALESCE(SUM_IF(cnt, is_pharma),     0) AS count_pharma,
        COALESCE(SUM_IF(cnt, is_ecom_other), 0) AS count_otherecom,

        -- covered rows only
        COALESCE(SUM_IF(cnt, is_covered),                   0) AS count_covered_total,
        COALESCE(SUM_IF(cnt, is_covered AND NOT is_ecom),   0) AS count_covered_nonecom,
        COALESCE(SUM_IF(cnt, is_covered AND is_ecom),       0) AS count_covered_ecom,
        COALESCE(SUM_IF(cnt, is_covered AND is_cehac),      0) AS count_covered_cehac,
        COALESCE(SUM_IF(cnt, is_covered AND is_diy),        0) AS count_covered_diy,
        COALESCE(SUM_IF(cnt, is_covered AND is_fashion),    0) AS count_covered_fashion,
        COALESCE(SUM_IF(cnt, is_covered AND is_home),       0) AS count_covered_home,
        COALESCE(SUM_IF(cnt, is_covered AND is_pharma),     0) AS count_covered_pharma,
        COALESCE(SUM_IF(cnt, is_covered AND is_ecom_other), 0) AS count_covered_otherecom
    FROM $input_classifiers_other_whidinfo_for_coverage
);

-- Adds coverage ratios (covered / all) for every segment to the summary row.
-- The denominator goes through $max(..., 1), so an empty segment yields a
-- ratio of 0 instead of a division by zero.
$input_counts_aggregate = (
    SELECT
        t.*,
        CAST(count_covered_total     AS Double) / $max(count_total, 1)     AS coverage_total,
        CAST(count_covered_nonecom   AS Double) / $max(count_nonecom, 1)   AS coverage_nonecom,
        CAST(count_covered_ecom      AS Double) / $max(count_ecom, 1)      AS coverage_ecom,
        CAST(count_covered_cehac     AS Double) / $max(count_cehac, 1)     AS coverage_cehac,
        CAST(count_covered_diy       AS Double) / $max(count_diy, 1)       AS coverage_diy,
        CAST(count_covered_fashion   AS Double) / $max(count_fashion, 1)   AS coverage_fashion,
        CAST(count_covered_home      AS Double) / $max(count_home, 1)      AS coverage_home,
        CAST(count_covered_pharma    AS Double) / $max(count_pharma, 1)    AS coverage_pharma,
        CAST(count_covered_otherecom AS Double) / $max(count_otherecom, 1) AS coverage_otherecom
    FROM $input_counts AS t
);

-- Overwrites the coverage table with the single summary row (counts + ratios).
INSERT INTO $output_coverage WITH TRUNCATE
SELECT
    *
FROM $input_counts_aggregate;

/* ---------------------------- */

-- Row "total": counters over every row of $input_joined.
-- WITH TRUNCATE resets the counters table; the later inserts append to it.
--   cnt*     - number of rows (overall and per segment)
--   sum_cnt* - sum of the `cnt` column (overall and per segment)
INSERT INTO $output WITH TRUNCATE
SELECT
    "total" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $input_joined;
COMMIT;

-- Row "hid_worked": same counters, restricted to rows with a non-zero hid.
-- Appended to the counters table.
INSERT INTO $output
SELECT
    "hid_worked" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $input_joined
WHERE hid <> 0;
COMMIT;

-- Row "glfilter_worked": same counters, restricted to rows with a non-empty
-- glfilter. Appended to the counters table.
INSERT INTO $output
SELECT
    "glfilter_worked" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $input_joined
WHERE glfilter <> "";
COMMIT;

-- Row "hid_worked_leaf": same counters, restricted to rows with a non-zero hid
-- whose category is a leaf (is_leaf). Appended to the counters table.
INSERT INTO $output
SELECT
    "hid_worked_leaf" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $input_joined
WHERE hid <> 0
    AND is_leaf;
COMMIT;

-- Row "hid_worked_nonleaf": same counters, restricted to rows with a non-zero
-- hid whose category is not a leaf. Appended to the counters table.
INSERT INTO $output
SELECT
    "hid_worked_nonleaf" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $input_joined
WHERE hid <> 0
    AND NOT is_leaf;
COMMIT;

-- Row "hid_worked_depth": depth totals over rows with a non-zero hid.
-- Unlike the other rows, the columns here hold sums of depth, not row counts:
--   cnt*     - sum of depth (0 when no row contributes)
--   sum_cnt* - sum of cnt * depth
-- Appended to the counters table.
INSERT INTO $output
SELECT
    "hid_worked_depth" AS `type`,

    COALESCE(SUM(1 * depth),                0) AS cnt,
    COALESCE(SUM_IF(1 * depth, is_cehac),   0) AS cnt_cehac,
    COALESCE(SUM_IF(1 * depth, is_diy),     0) AS cnt_diy,
    COALESCE(SUM_IF(1 * depth, is_fashion), 0) AS cnt_fashion,
    COALESCE(SUM_IF(1 * depth, is_home),    0) AS cnt_home,
    COALESCE(SUM_IF(1 * depth, is_pharma),  0) AS cnt_pharma,
    COALESCE(SUM_IF(1 * depth, is_other),   0) AS cnt_other,

    SUM(cnt * depth)                AS sum_cnt,
    SUM_IF(cnt * depth, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt * depth, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt * depth, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt * depth, is_home)    AS sum_cnt_home,
    SUM_IF(cnt * depth, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt * depth, is_other)   AS sum_cnt_other
FROM $input_joined
WHERE hid <> 0;
COMMIT;


-- Top slice by cnt. The 10000 rows with the highest cnt fix a threshold (their
-- smallest cnt); $top10k then keeps every row at or above that threshold, so
-- rows tied at the threshold are all included and the slice can exceed 10000 rows.
$raw_top10k =
    SELECT *
    FROM $input_joined
    ORDER BY cnt DESC
    LIMIT 10000;
$cnt_top10k =
    SELECT MIN(cnt)
    FROM $raw_top10k;
$top10k =
    SELECT *
    FROM $input_joined
    WHERE cnt >= $cnt_top10k;

-- Row "top10k_total": row counts and cnt sums over every row of $top10k.
-- Appended to the counters table.
INSERT INTO $output
SELECT
    "top10k_total" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $top10k;
COMMIT;


-- Row "top10k_hid_worked": same counters over $top10k rows with a non-zero hid.
-- Appended to the counters table.
INSERT INTO $output
SELECT
    "top10k_hid_worked" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $top10k
WHERE hid <> 0;
COMMIT;

-- Row "top10k_glfilter_worked": same counters over $top10k rows with a
-- non-empty glfilter. Appended to the counters table.
INSERT INTO $output
SELECT
    "top10k_glfilter_worked" AS `type`,

    COUNT(*)             AS cnt,
    COUNT_IF(is_cehac)   AS cnt_cehac,
    COUNT_IF(is_diy)     AS cnt_diy,
    COUNT_IF(is_fashion) AS cnt_fashion,
    COUNT_IF(is_home)    AS cnt_home,
    COUNT_IF(is_pharma)  AS cnt_pharma,
    COUNT_IF(is_other)   AS cnt_other,

    SUM(cnt)                AS sum_cnt,
    SUM_IF(cnt, is_cehac)   AS sum_cnt_cehac,
    SUM_IF(cnt, is_diy)     AS sum_cnt_diy,
    SUM_IF(cnt, is_fashion) AS sum_cnt_fashion,
    SUM_IF(cnt, is_home)    AS sum_cnt_home,
    SUM_IF(cnt, is_pharma)  AS sum_cnt_pharma,
    SUM_IF(cnt, is_other)   AS sum_cnt_other
FROM $top10k
WHERE glfilter <> "";
COMMIT;
