PRAGMA AnsiInForEmptyOrNullableItemsCollections;
PRAGMA yson.AutoConvert; 
PRAGMA SimpleColumns;

use hahn;

-- Runtime parameters supplied by the caller; the "branch" key selects the working directory.
DECLARE $param_dict AS Dict<String, String>;

-- Root directory of this pipeline's tables: the fixed path for the "prod" branch,
-- otherwise a per-branch subdirectory under sow_dev.
-- NOTE(review): if "branch" is missing from $param_dict the lookup presumably yields NULL,
-- which would make the concatenated path NULL too -- confirm the caller always passes it.
$base_dir = IF(
  $param_dict["branch"] = "prod",
  "//home/vipplanners/sow",
  "//home/vipplanners/sow_dev" || '/' || $param_dict["branch"]
);

------------------- input

-- latest version of the client contact data
$client_info = $base_dir || '/' || 'dict/client_info.v4';

-- directory of companies with their activity statuses
$spark_raw_all = '//home/comdep-analytics/zedlaa/spark/raw/all_2';

-- schematized SPARK data
$spark_schematized = '//home/comdep-analytics/zedlaa/spark/all_2';

-- table with additional contacts
$spark_contacts_table = '//home/comdep-analytics/zedlaa/spark/contacts';

------------------ output

-- path where the next version of the client-to-company links is written
$path = $base_dir || '/' || "dict/intersection/latest";

------------------- tunable variables

-- status codes of operating companies (matched against the "/@Code" attribute of Status)
$working_ltds_codes = ("24", "1333");

-- parameters of the intersection scoring model: score written for each kind of match
$phone_score = 1;
$mail_score = 1;
$inn_score = 1;
$ogrn_score = 1;
$domain_score = 1;

-- domains excluded from domain matching (filtered out in $domains)
$bad_domain = (
  "newreg.ru",
  "vestnikao.ru",
  "rostatus.ru",
  "ecki.ru",
  "nrcreg.ru",
  "1registrator.nnov.ru"
);
-- e-mail addresses excluded from e-mail matching (filtered out in $mails)
$bad_mails = (
  "1@mail.ru",
  "buh@xperta.ru",
  "otdelnalogov@mail.ru"
);
------------------- assemble SPARK data
-- SparkID of every company whose status code is listed in $working_ltds_codes
$working_ltds = (
    SELECT SparkID
    FROM (
        SELECT
            SparkID,
            -- status code is stored as the "Code" attribute of the Status YSON node
            Yson::ConvertToString(Yson::YPath(Yson::Parse(Status), "/@Code")) AS status_code
        FROM $spark_raw_all WITH InferSchema
    )
    -- keep only active businesses
    WHERE status_code IN $working_ltds_codes
);

-- Schematized SPARK rows restricted to the companies present in $working_ltds.
-- The semi join only filters the left side; no columns are taken from $working_ltds.
$spark = (
    SELECT
        a.spark_id AS spark_id,
        a.email AS email,
        a.domain AS domain,
        a.phone_list AS phone_list,
        a.inn AS inn,
        a.ogrn AS ogrn
    FROM $spark_schematized AS a
    LEFT SEMI JOIN $working_ltds AS b
        -- SparkID is cast to Int64 for the comparison with spark_id
        ON a.spark_id = CAST(b.SparkID AS Int64)
);

-- Additional contact data (comes from a separate call): one output row per element
-- of the `domains` list, exposing that element's `domain` member.
$spark_contacts = (
    SELECT
        spark_id,
        inn,
        domain_entry.domain AS domain
    FROM $spark_contacts_table
    FLATTEN LIST BY domains AS domain_entry
);

--делаем нормализацию, чтобы делать джойн на результат из https://st.yandex-team.ru/OPERANALYTICS-1252
PRAGMA File('libcrypta_identifier_udf.so', 'yt://hahn/home/crypta/public/udfs/libcrypta_identifier_udf.so');
PRAGMA Udf('libcrypta_identifier_udf.so');

-- Returns False for a NULL INN, for the all-zero placeholder '0000000000' and for
-- strings shorter than 9 characters; True for everything else.
-- NOTE(review): the 9-character minimum presumably tolerates a lost leading zero -- confirm.
$is_valid_inn = ($str_inn) -> {
    RETURN IF(
        $str_inn IS NULL
            OR $str_inn = '0000000000'
            OR length($str_inn) < 9,
        False,
        True
    );
};

-- Reference for the OGRN format:
-- http://www.consultant.ru/cons/cgi/online.cgi?rnd=C4A70E496692A9AA3DD0F1281BD7118F&req=doc&base=LAW&n=311536&dst=100012&fld=134&stat=refcode%3D16876%3Bdstident%3D100012%3Bindex%3D0#25yk8t3tqrb
-- Returns False for a NULL OGRN and for strings shorter than 13 characters; True otherwise.
$is_valid_ogrn = ($str_ogrn) -> {
    RETURN IF(
        $str_ogrn IS NULL OR length($str_ogrn) < 13,
        False,
        True
    );
};

-- Casts the INN to String and returns it when $is_valid_inn accepts it; NULL otherwise.
$parse_inn = ($inn) -> {
    $inn_text = CAST($inn AS String);
    RETURN IF($is_valid_inn($inn_text), $inn_text);
};

-- Casts the OGRN to String and returns it when $is_valid_ogrn accepts it; NULL otherwise.
$parse_ogrn = ($ogrn) -> {
    $ogrn_text = CAST($ogrn AS String);
    RETURN IF($is_valid_ogrn($ogrn_text), $ogrn_text);
};

-- Domain normalization: extracts the host from a URL-like string, lower-cases it and
-- cuts a leading "www". When no host can be obtained the sentinel "fake.domain" is used.
$get_domain = ($url) -> {
    -- input that is not valid UTF becomes NULL; otherwise it is stripped
    $stripped_url = IF(Unicode::IsUtf($url), String::Strip($url));
    $parsed_url = Url::Parse(Url::NormalizeWithDefaultHttpScheme($stripped_url));
    $host_name = Url::ForcePunycodeToHostName($parsed_url.Host);
    -- for now the host is taken as is, without reducing it to the significant domain
    -- (Url::GetSignificantDomain is intentionally not applied)
    $lowered_host = String::ToLower($host_name) ?? "fake.domain";
    $without_www = Url::CutWWW($lowered_host);
    RETURN $without_www;
};

-- SPARK identifiers in normalized form: one row per (company, phone) combination with
-- validity flags for phone, e-mail, INN and OGRN, plus the normalized domain.
$spark_normalized = (
    -- DISTINCT because SPARK sometimes contains repeated contact data
    SELECT DISTINCT
        spark_id,
        phone_is_valid,
        phone,
        email_is_valid,
        email,
        inn_is_valid,
        inn,
        ogrn_is_valid,
        ogrn,
        domain
    FROM (
        -- unpack the parsed identifiers into plain columns
        SELECT
            spark_id,
            phone.IsValid AS phone_is_valid,
            phone.Normalize AS phone,
            email.IsValid AS email_is_valid,
            email.Normalize AS email,
            $is_valid_inn(inn) AS inn_is_valid,
            $parse_inn(inn) AS inn,
            $is_valid_ogrn(ogrn) AS ogrn_is_valid,
            $parse_ogrn(ogrn) AS ogrn,
            $get_domain(domain) AS domain
        FROM (
            -- one row per phone_list element; the phone string is code || number,
            -- with missing parts replaced by an empty string
            -- NOTE(review): FLATTEN presumably drops companies whose phone_list is empty,
            -- together with their e-mail/INN/OGRN/domain -- confirm this is intended.
            SELECT
                spark_id,
                Identifiers::Email(COALESCE(email, '')) AS email,
                domain,
                inn,
                ogrn,
                Identifiers::Phone(
                    COALESCE(phone_entry.code, '') || COALESCE(phone_entry.number, '')
                ) AS phone
            FROM $spark
            FLATTEN LIST BY phone_list AS phone_entry
        )
    )
);

-- Additional contacts with the INN validated and parsed; the domain is passed through
-- unchanged (it is normalized later, in $domains).
$spark_contacts_normalized = (
    SELECT DISTINCT
        spark_id,
        inn_is_valid,
        inn,
        domain
    FROM (
        -- computed columns are produced in a subquery so that DISTINCT sees plain columns
        SELECT
            spark_id,
            $is_valid_inn(inn) AS inn_is_valid,
            $parse_inn(inn) AS inn,
            domain
        FROM $spark_contacts
    )
);

-- Unique (spark_id, phone) pairs with a valid, non-empty normalized phone
$phones = (
    SELECT DISTINCT
        spark_id,
        phone
    FROM $spark_normalized
    WHERE phone_is_valid
      AND COALESCE(phone, "") != ""
);
-- Unique (spark_id, email) pairs with a valid, non-empty e-mail that is not in $bad_mails
$mails = (
    SELECT DISTINCT
        spark_id,
        email
    FROM $spark_normalized
    WHERE email_is_valid
      AND COALESCE(email, "") != ""
      AND email NOT IN $bad_mails
);

-- Unique (spark_id, inn) pairs with a valid, non-empty INN, taken from both the main
-- SPARK data and the additional contacts
$inn = (
    SELECT DISTINCT
        spark_id,
        inn
    FROM (
        SELECT spark_id, inn
        FROM $spark_normalized
        WHERE inn_is_valid
          AND COALESCE(inn, '') != ''

        UNION ALL

        SELECT spark_id, inn
        FROM $spark_contacts_normalized
        WHERE inn_is_valid
          AND COALESCE(inn, '') != ''
    )
);

-- Unique (spark_id, ogrn) pairs with a valid, non-empty OGRN
$ogrn = (
    SELECT DISTINCT
        spark_id,
        ogrn
    FROM $spark_normalized
    WHERE ogrn_is_valid
      AND COALESCE(ogrn, '') != ''
);

-- Unique (spark_id, domain) pairs from the main SPARK data and the additional contacts.
-- Domains are passed through $get_domain so both sources are in the same normalized form.
$domains = (
    SELECT DISTINCT
        spark_id,
        domain
    FROM (
        SELECT spark_id, $get_domain(domain) AS domain
        FROM $spark_normalized

        UNION ALL

        SELECT spark_id, $get_domain(domain) AS domain
        FROM $spark_contacts_normalized
    )
    WHERE COALESCE(domain, "") != ""
      -- $get_domain substitutes "fake.domain" when no host can be extracted, so a missing
      -- domain never shows up as NULL or "" here. Without this filter every company lacking
      -- a domain would be joinable to any client whose domain also fails to parse.
      AND domain != "fake.domain"
      AND domain NOT IN $bad_domain
);

-- Write the next calculation of client <-> SPARK company links, replacing the table contents.
-- Every UNION ALL branch joins one SPARK identifier set ($phones, $inn, $ogrn, $domains,
-- $mails) with one field of $client_info and emits (client_id, spark_id, criterion, value, score).
-- List-typed client fields are expanded with FLATTEN LIST BY before joining.
-- Each criterion must be produced by exactly one branch: repeating a branch would write
-- every one of its matches twice.
INSERT INTO $path WITH TRUNCATE

-- criterion phone_balance: $client_info.phone_last_balance
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "phone_balance" AS criterion,
    a.phone AS value,
    $phone_score AS score
FROM $phones AS a
INNER JOIN (
    SELECT client_id, phone_last_balance
    FROM $client_info
    WHERE phone_last_balance_is_valid
) AS b
    ON a.phone = b.phone_last_balance

UNION ALL

-- criterion phone_balance_person: elements of $client_info.phone_top_balance_person
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "phone_balance_person" AS criterion,
    a.phone AS value,
    $phone_score AS score
FROM $phones AS a
INNER JOIN (
    SELECT client_id, phone_top_balance_person
    FROM $client_info
    FLATTEN LIST BY phone_top_balance_person
    WHERE phone_top_balance_person_has_valid
) AS b
    ON a.phone = b.phone_top_balance_person

UNION ALL

-- criterion phone_crm: elements of $client_info.phone_top_crm
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "phone_crm" AS criterion,
    a.phone AS value,
    $phone_score AS score
FROM $phones AS a
INNER JOIN (
    SELECT client_id, phone_top_crm
    FROM $client_info
    FLATTEN LIST BY phone_top_crm
    WHERE phone_top_crm_has_valid
) AS b
    ON a.phone = b.phone_top_crm

UNION ALL

-- criterion phone_direct: $client_info.phone_last_direct
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "phone_direct" AS criterion,
    a.phone AS value,
    $phone_score AS score
FROM $phones AS a
INNER JOIN (
    SELECT client_id, phone_last_direct
    FROM $client_info
    WHERE phone_last_direct_is_valid
) AS b
    ON a.phone = b.phone_last_direct

UNION ALL

-- criterion inn_balance: elements of $client_info.inn_top_balance_person
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "inn_balance" AS criterion,
    a.inn AS value,
    $inn_score AS score
FROM $inn AS a
INNER JOIN (
    SELECT client_id, inn_top_balance_person
    FROM $client_info
    FLATTEN LIST BY inn_top_balance_person
    WHERE inn_top_balance_person_has_valid
) AS b
    ON a.inn = b.inn_top_balance_person

UNION ALL

-- criterion inn_sprav_company: $client_info.inn_sprav_company_max_cnt
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "inn_sprav_company" AS criterion,
    a.inn AS value,
    $inn_score AS score
FROM $inn AS a
INNER JOIN (
    SELECT client_id, inn_sprav_company_max_cnt
    FROM $client_info
    WHERE inn_sprav_company_max_cnt_is_valid
) AS b
    ON a.inn = b.inn_sprav_company_max_cnt

UNION ALL

-- criterion inn_sprav_banner: $client_info.inn_sprav_banner_max_cnt
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "inn_sprav_banner" AS criterion,
    a.inn AS value,
    $inn_score AS score
FROM $inn AS a
INNER JOIN (
    SELECT client_id, inn_sprav_banner_max_cnt
    FROM $client_info
    WHERE inn_sprav_banner_max_cnt_is_valid
) AS b
    ON a.inn = b.inn_sprav_banner_max_cnt

UNION ALL

-- criterion ogrn_sprav_company: $client_info.ogrn_sprav_company_max_cnt
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "ogrn_sprav_company" AS criterion,
    a.ogrn AS value,
    $ogrn_score AS score
FROM $ogrn AS a
INNER JOIN (
    SELECT client_id, ogrn_sprav_company_max_cnt
    FROM $client_info
    WHERE ogrn_sprav_company_max_cnt_is_valid
) AS b
    ON a.ogrn = b.ogrn_sprav_company_max_cnt

UNION ALL

-- criterion ogrn_sprav_banner: $client_info.ogrn_sprav_banner_max_cnt
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "ogrn_sprav_banner" AS criterion,
    a.ogrn AS value,
    $ogrn_score AS score
FROM $ogrn AS a
INNER JOIN (
    SELECT client_id, ogrn_sprav_banner_max_cnt
    FROM $client_info
    WHERE ogrn_sprav_banner_max_cnt_is_valid
) AS b
    ON a.ogrn = b.ogrn_sprav_banner_max_cnt

UNION ALL

-- criterion domain: $client_info.curr_client_main_domain, normalized with $get_domain
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "domain" AS criterion,
    a.domain AS value,
    $domain_score AS score
FROM $domains AS a
INNER JOIN (
    SELECT client_id, curr_client_main_domain
    FROM $client_info
    WHERE COALESCE(curr_client_main_domain, "") != ""
) AS b
    ON a.domain = $get_domain(b.curr_client_main_domain)

UNION ALL

-- criterion domain_3y: $client_info.roll3y_client_main_domain, normalized with $get_domain
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "domain_3y" AS criterion,
    a.domain AS value,
    $domain_score AS score
FROM $domains AS a
INNER JOIN (
    SELECT client_id, roll3y_client_main_domain
    FROM $client_info
    WHERE COALESCE(roll3y_client_main_domain, "") != ""
) AS b
    ON a.domain = $get_domain(b.roll3y_client_main_domain)

UNION ALL

-- criterion email_balance: $client_info.email_last_balance
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "email_balance" AS criterion,
    a.email AS value,
    $mail_score AS score
FROM $mails AS a
INNER JOIN (
    SELECT client_id, email_last_balance
    FROM $client_info
    WHERE email_last_balance_is_valid
) AS b
    ON a.email = b.email_last_balance

UNION ALL

-- criterion email_balance_person: elements of $client_info.email_top_balance_person
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "email_balance_person" AS criterion,
    a.email AS value,
    $mail_score AS score
FROM $mails AS a
INNER JOIN (
    SELECT client_id, email_top_balance_person
    FROM $client_info
    FLATTEN LIST BY email_top_balance_person
    WHERE email_top_balance_person_has_valid
) AS b
    ON a.email = b.email_top_balance_person

UNION ALL

-- criterion email_crm: elements of $client_info.email_top_crm
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "email_crm" AS criterion,
    a.email AS value,
    $mail_score AS score
FROM $mails AS a
INNER JOIN (
    SELECT client_id, email_top_crm
    FROM $client_info
    FLATTEN LIST BY email_top_crm
    WHERE email_top_crm_has_valid
) AS b
    ON a.email = b.email_top_crm

UNION ALL

-- criterion email_direct: $client_info.email_last_direct
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "email_direct" AS criterion,
    a.email AS value,
    $mail_score AS score
FROM $mails AS a
INNER JOIN (
    SELECT client_id, email_last_direct
    FROM $client_info
    WHERE email_last_direct_is_valid
) AS b
    ON a.email = b.email_last_direct

UNION ALL

-- criterion email_vcard: elements of $client_info.email_top_vcard
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "email_vcard" AS criterion,
    a.email AS value,
    $mail_score AS score
FROM $mails AS a
INNER JOIN (
    SELECT client_id, email_top_vcard
    FROM $client_info
    FLATTEN LIST BY email_top_vcard
    WHERE email_top_vcard_has_valid
) AS b
    ON a.email = b.email_top_vcard

UNION ALL

-- criterion phone_vcard: elements of $client_info.phone_top_vcard
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "phone_vcard" AS criterion,
    a.phone AS value,
    $phone_score AS score
FROM $phones AS a
INNER JOIN (
    SELECT client_id, phone_top_vcard
    FROM $client_info
    FLATTEN LIST BY phone_top_vcard
    WHERE phone_top_vcard_has_valid
) AS b
    ON a.phone = b.phone_top_vcard

UNION ALL

-- criterion phone_sprav: elements of $client_info.phone_top_sprav
SELECT
    b.client_id AS client_id,
    a.spark_id AS spark_id,
    "phone_sprav" AS criterion,
    a.phone AS value,
    $phone_score AS score
FROM $phones AS a
INNER JOIN (
    SELECT client_id, phone_top_sprav
    FROM $client_info
    FLATTEN LIST BY phone_top_sprav
    WHERE phone_top_sprav_has_valid
) AS b
    ON a.phone = b.phone_top_sprav
;