USE hahn;
PRAGMA yt.MaxRowWeight = "32M";

-- Run parameters supplied by the caller; this script reads the "branch" key.
DECLARE $param_dict AS Dict<String, String>;

-- Output root: the shared directory for the "prod" branch, otherwise a
-- per-branch directory under sow_dev.
-- NOTE(review): the dictionary lookup is optional, so a missing "branch" key
-- makes the path NULL — confirm that callers always pass it.
$branch = $param_dict["branch"];
$base_dir = IF(
  $branch == "prod",
  "//home/vipplanners/sow",
  "//home/vipplanners/sow_dev" || '/' || $branch
);

-- input
-- Model file attached to the query under the local name 'good_wallet.cbm';
-- it is loaded further down through FilePath('good_wallet.cbm').
PRAGMA file('good_wallet.cbm', 'yt://hahn/home/vipplanners/sow/models/good_wallet.cbm');

-- Source table with SPARK company records (read by $spark_schematized).
$spark_schematized_path = "//home/comdep-analytics/april/spark/ActualSparkData";
-- table with additional contacts
$spark_contacts_table = '//home/comdep-analytics/zedlaa/spark/contacts';

-- output
-- Destination table for the scores, located under the branch-dependent $base_dir.
$good_wallet_size = $base_dir || '/' || 'dict/good_wallet/20201228';

/* features used by the model
    "sp_1_revenue",
    "sp_3_insurance_bonus",
    "sp_company_type",
    "sp_inn_region",
    "sp_has_domain",
    "sp_main_okved_rnd",
    "sp_main_okved_l1",
    "sp_main_okved_l2",
*/

-- One row per company: for every spark_id keep the whole source row with the
-- greatest last_update_date. Only firms (is_firm) with a positive spark_id and
-- company_type 1 or 3 are considered.
$spark_schematized = (
SELECT *
FROM (
  -- MAX_BY over TableRow() yields a single struct column holding the chosen row;
  -- FLATTEN COLUMNS below expands it back into ordinary columns.
  SELECT
    MAX_BY(TableRow(), last_update_date) AS latest_row
  FROM $spark_schematized_path
  WHERE is_firm
    AND company_type IN (1, 3)
    AND spark_id > 0
  GROUP BY spark_id
)
FLATTEN COLUMNS
);

-- finance features

-- Expands one reporting-period struct into a list of flat rows.
--   $p_str_yson: struct with date_begin, date_end, period_name and string_list
--                (a list of structs with name and value).
-- Returns one struct per string_list item carrying the period attributes plus
-- the item's name and value; missing strings become "" and a missing value 0.
$get_one_element = ($p_str_yson) -> {
  $period_begin = $p_str_yson.date_begin ?? "";
  $period_end = $p_str_yson.date_end ?? "";
  $period_label = $p_str_yson.period_name ?? "";

  -- Builds the output row for a single (name, value) item of the period.
  $to_row = ($item) -> {
    RETURN AsStruct(
      $period_begin AS date_begin,
      $period_end AS date_end,
      $period_label AS period_name,
      $item.name ?? "" AS name,
      $item.value ?? 0 AS value
    );
  };

  RETURN ListMap($p_str_yson.string_list, $to_row);
};

-- Applies $get_one_element to every reporting period of a company.
--   $p_finance: list of period structs.
-- Returns a list (one entry per period) of lists of flat rows.
$get_finance = ($p_finance) -> {
  RETURN ListMap($p_finance, $get_one_element);
};


-- Finance data in long format: one distinct row per
-- (spark_id, period, indicator name, value).
$spark_finance = (
SELECT DISTINCT
  spark_id
, item.date_begin AS date_begin
, item.date_end AS date_end
, item.period_name AS period_name
, item.name AS name
, item.value AS value
FROM (
  -- First flatten: one row per reporting period, each still holding a list of items.
  SELECT
    spark_id
  , period_items
  FROM $spark_schematized
  FLATTEN LIST BY ($get_finance(finance) AS period_items)
  WHERE spark_id > 0
)
-- Second flatten: one row per item inside the period.
FLATTEN LIST BY period_items AS item
);

-- Reporting year whose figures are turned into features.
$finance_year = 2019;

-- Numeric features per company, taken from the $finance_year reporting period.
-- Companies without the corresponding indicator get 0.
$spark_finance_features = (
SELECT
  spark_id
-- regular company, company_type == 1
, COALESCE(CAST(SUM_IF(value, name == "Выручка") AS Float), 0) AS sp_1_revenue
-- insurance company, company_type == 3
, COALESCE(CAST(SUM_IF(value, name == "Заработанные страховые премии – нетто-перестрахование") AS Float), 0) AS sp_3_insurance_bonus
FROM (
  -- One row per (spark_id, indicator name).
  -- NOTE(review): every surviving row has the same period key, so MAX_BY picks
  -- an arbitrary row when an indicator is reported more than once for the year —
  -- confirm this is acceptable.
  SELECT MAX_BY(TableRow(), CAST(period_name AS Int64))
  FROM $spark_finance
  WHERE CAST(period_name AS Int64) = $finance_year
  GROUP BY spark_id, name
) FLATTEN COLUMNS
GROUP BY spark_id
);

-- cat features

-- Hosts that are ignored when deciding whether a company has a domain
-- (rows with these values are filtered out in $domains).
-- NOTE(review): presumably third-party registrar/directory sites rather than
-- company-owned domains — confirm with the data owners.
$bad_domain = (
  "newreg.ru",
  "vestnikao.ru",
  "rostatus.ru",
  "ecki.ru",
  "nrcreg.ru",
  "1registrator.nnov.ru"
);

-- Domain recorded on the main SPARK row of each company.
$spark = (
SELECT
  src.spark_id AS spark_id
, src.domain AS domain
FROM $spark_schematized AS src
WHERE src.spark_id > 0
);
-- Additional contact data from a separate table: the `domains` list is
-- unrolled so that every listed domain becomes its own row.
$spark_contacts = (
SELECT
  spark_id
, contact.domain AS domain
FROM $spark_contacts_table
FLATTEN LIST BY domains AS contact
);

-- Normalizes a URL or bare host name to a lower-case host without "www.".
--   $url: raw value; it is only processed when it is valid UTF-8.
-- Returns the host, or the placeholder "fake.domain" when no host could be
-- extracted.
$get_domain = ($url) -> {
    -- Non-UTF-8 input yields NULL here and ends up as the placeholder.
    $trimmed = IF(Unicode::IsUtf($url), String::Strip($url));
    $parts = Url::Parse(Url::NormalizeWithDefaultHttpScheme($trimmed));
    $host = Url::ForcePunycodeToHostName($parts.Host);
    -- For now the host is used as is, without reducing it to the significant
    -- domain (Url::GetSignificantDomain).
    $host_lower = String::ToLower($host) ?? "fake.domain";
    RETURN Url::CutWWW($host_lower);
};
-- Normalized main-record domains, de-duplicated: only unique
-- (spark_id, domain) pairs are kept because contact data in SPARK is
-- sometimes duplicated.
$spark_normalized = (
SELECT DISTINCT
  spark_id
, domain
FROM (
  -- Normalize first so that DISTINCT compares the cleaned host names.
  SELECT
    spark_id
  , $get_domain(domain) AS domain
  FROM $spark
)
);
-- Unique (spark_id, domain) pairs from the additional contacts.
-- The raw domain value is kept here; it is normalized later in $domains.
$spark_contacts_normalized = (
SELECT
  spark_id
, domain
FROM $spark_contacts
GROUP BY
  spark_id
, domain
);

-- Per-company flag telling whether at least one usable domain is known.
-- Domains come from the main SPARK record and from the additional contacts;
-- empty values and hosts listed in $bad_domain are ignored.
-- has_domain is "1" when any remaining host differs from the "fake.domain"
-- placeholder produced by $get_domain, otherwise "0". Companies whose rows are
-- all filtered out do not appear in the result at all.
--
-- The flag is computed with COUNT_IF over the whole group. The previous
-- SOME(domain) inspected one arbitrary row, so a company having both a real
-- host and a placeholder row could randomly be reported as "0".
$domains = (
SELECT
  spark_id
, IF(COUNT_IF(domain != "fake.domain") > 0, "1", "0") AS has_domain
FROM (
    -- NOTE(review): $spark_normalized is already passed through $get_domain;
    -- the second application is kept as in the original — confirm it is
    -- redundant before removing it.
    SELECT spark_id, $get_domain(domain) AS domain
    FROM $spark_normalized
      UNION ALL
    SELECT spark_id, $get_domain(domain) AS domain
    FROM $spark_contacts_normalized
)
WHERE NVL(domain, "") != ""
  AND domain NOT IN $bad_domain
GROUP BY spark_id
);

-- Region feature: the first two characters of the INN, or '-1' when the INN
-- is NULL.
$sp_inn_region = ($x) -> {
  $region_prefix = SUBSTRING($x, 0, 2);
  RETURN $region_prefix ?? '-1';
};

-- OKVED code truncated to its first two dot-separated parts
-- (e.g. "62.01.1" -> "62.01"); '-1' when the code is NULL.
$sp_main_okved_rnd = ($x) -> {
  $parts = String::SplitToList($x, '.');
  $leading = ListTake($parts, 2);
  RETURN String::JoinFromList($leading, '.') ?? '-1';
};

-- First dot-separated part of the OKVED code; '-1' when it is absent.
$sp_main_okved_l1 = ($x) -> {
  $parts = String::SplitToList($x, '.');
  $first_part = $parts[0];
  RETURN $first_part ?? '-1';
};
-- Second dot-separated part of the OKVED code; '-1' when the code has fewer
-- than two parts or is NULL.
$sp_main_okved_l2 = ($x) -> {
  $parts = String::SplitToList($x, '.');
  $second_part = $parts[1];
  RETURN $second_part ?? '-1';
};
-- Company type rendered as a string; '-1' when it is NULL.
$sp_company_type = ($x) -> {
  $as_text = CAST($x AS String);
  RETURN $as_text ?? '-1';
};
-- Domain flag with a default: a NULL flag (no row in $domains) becomes '0'.
$sp_has_domain = ($x) -> {
  RETURN $x ?? '0';
};

-- Categorical features, one row per company from $spark_schematized.
-- The domain flag is attached with a LEFT JOIN, so companies missing from
-- $domains are kept and receive the default flag via $sp_has_domain.
$spark_cat_features = (
SELECT
  company.spark_id AS spark_id
, $sp_main_okved_rnd(company.main_okved2_code) AS sp_main_okved_rnd
, $sp_main_okved_l1(company.main_okved2_code) AS sp_main_okved_l1
, $sp_main_okved_l2(company.main_okved2_code) AS sp_main_okved_l2
, $sp_company_type(company.company_type) AS sp_company_type
, $sp_inn_region(company.inn) AS sp_inn_region
, $sp_has_domain(dom.has_domain) AS sp_has_domain
FROM $spark_schematized AS company
LEFT JOIN $domains AS dom
  ON company.spark_id == dom.spark_id
WHERE company.spark_id > 0
);


-- join feature

-- Model handle loaded from the file attached to the query as 'good_wallet.cbm'.
$good_wallet_eval = CatBoost::LoadModel(FilePath('good_wallet.cbm'));

-- Model input rows: FloatFeatures and CatFeatures lists plus the company id
-- in PassThrough. The element order inside both lists is significant and is
-- kept exactly as before.
-- After the FULL JOIN columns of either side may be NULL, hence the defaults.
$features_to_process = (
SELECT
  AsList(
      f.sp_1_revenue ?? 0
    , f.sp_3_insurance_bonus ?? 0
  ) AS FloatFeatures
, AsList(
      c.sp_company_type ?? '-1'
    , c.sp_inn_region ?? '-1'
    , c.sp_has_domain ?? '0'
    , c.sp_main_okved_rnd ?? '-1'
    , c.sp_main_okved_l1 ?? '-1'
    , c.sp_main_okved_l2 ?? '-1'
  ) AS CatFeatures
  -- Whichever side of the join has the id supplies it.
, f.spark_id ?? c.spark_id AS PassThrough
FROM $spark_cat_features AS c
FULL JOIN $spark_finance_features AS f
  ON c.spark_id == f.spark_id
);

-- Scores all feature rows with the loaded model. The output rows expose
-- Result and PassThrough, which the final SELECT reads.
$processed = (
    PROCESS $features_to_process
    USING CatBoost::EvaluateBatch($good_wallet_eval, TableRows())
);

-- Overwrites the output table with one row per scored company:
-- spark_id (taken from PassThrough) and pred_good_wallet (first element of
-- Result). The raw PassThrough and Result columns are dropped.
INSERT INTO $good_wallet_size WITH TRUNCATE
SELECT
  scored.*
, scored.Result[0] AS pred_good_wallet
, scored.PassThrough AS spark_id
WITHOUT
  PassThrough
, Result
FROM $processed AS scored
ORDER BY spark_id
;
