-- Session settings for the YT-backed YQL run.
-- NOTE(review): the per-pragma notes below paraphrase the YQL pragma
-- reference; confirm them against the YQL version deployed on the cluster.

-- Infer a schema from table contents for inputs that have no declared schema.
PRAGMA yt.InferSchema = '1';
-- Pool tree in which the YT operations are scheduled.
PRAGMA yt.PoolTrees             = "physical";
-- Scheduler weight given to the operations started by this script.
PRAGMA yt.DefaultOperationWeight = '100.0';
-- Additional pool tree that operations may tentatively be run in.
PRAGMA yt.TentativePoolTrees    = "cloud";
-- Storage layout hint ("scan") for the tables this script creates.
PRAGMA yt.OptimizeFor           = "scan";
-- ANSI NULL semantics for IN when the right-hand collection is empty or nullable.
PRAGMA AnsiInForEmptyOrNullableItemsCollections;
-- Let Yson UDF conversions coerce values to the requested type automatically.
PRAGMA yson.AutoConvert; 
-- Upper bound on the number of jobs in a single YT operation.
PRAGMA yt.MaxJobCount = "100000";
-- Name JOIN result columns without the source-table prefix.
PRAGMA SimpleColumns;

-- Default cluster for every table path used below.
use hahn;

-- Run parameters supplied by the caller; this script reads the "branch" key.
DECLARE $param_dict AS Dict<String, String>;

-- Root directory of the pipeline's own tables: the production tree when the
-- "branch" parameter equals "prod", otherwise a per-branch subdirectory of the
-- dev tree.
-- NOTE(review): a missing "branch" key makes the lookup NULL, so the ELSE
-- branch is taken and the resulting path is NULL as well.
$base_dir = CASE
    WHEN $param_dict["branch"] = "prod"
        THEN "//home/vipplanners/sow"
    ELSE "//home/vipplanners/sow_dev" || '/' || $param_dict["branch"]
END;

------------------- input

-- sources maintained by Sasha Litovchenko:
-- log with company structure and information about subsidiaries
$holdings = '//home/comdep-analytics/zedlaa/spark/raw/company_spark_risks_report_xml';
-- table with company activity statuses
$spark_raw_all = '//home/comdep-analytics/zedlaa/spark/raw/all_2';
-- schematized SPARK data
$spark_schematized = '//home/comdep-analytics/zedlaa/spark/all_2';


-- internal sources produced by earlier pipeline steps
-- every client_id -> spark_id intersection variant
$intersections = $base_dir || '/' || 'dict/intersection/latest';
$clientid_to_sparkid = $base_dir || '/' || 'dict/curr_client_spark_id/wscore_adj.v12';

-- company revenue
$company_turnover = $base_dir || '/' || 'money/latest';
$clusters = $base_dir || '/' || 'dict/cluster/final_table';

-- results of parsing IFRS reports
-- NOTE(review): unlike the paths above this one is not derived from $base_dir,
-- so every branch reads the production copy — confirm this is intended.
$msfo = '//home/vipplanners/sow/dict/msfo';

-- internal tables (NOT SPARK):
$billing = '//home/comdep-analytics/billing/sales_daily_full';
$tiers = '//home/comdep-analytics/public/client_tiers/fact/latest_v2';
$orderstat_log = '//home/yabs/stat/OrderStatDay';
-- NOTE(review): this path reads like a placeholder — verify it points at the real OrderInfo dictionary.
$orderinfo = '//home/yabs/dict/path_to_dict_order_info_table';


------------------- output
-- destination table; the final INSERT rewrites it with TRUNCATE
$output_path = $base_dir || '/' || 'meat/latest';


-------------
-- parameters that are still expected to be tuned
-- status codes for which a company from $spark_raw_all is kept in $working_ltds
$working_ltds_codes = ("24", "1333");
-- half-open date window [start_date, day_after_end_date) applied in $orderstat.
-- NOTE(review): $acts and $turnover_cooked do not read these two values and
-- hard-code 2019 instead — keep them in sync when the period changes.
$start_date = "2019-01-01";
$day_after_end_date = "2020-01-01";


-------------
-- All working companies: distinct (SparkID, FullNameRus, is_firm) rows of
-- $spark_raw_all whose status code is listed in $working_ltds_codes.
$working_ltds = (
    SELECT DISTINCT
        SparkID,
        FullNameRus,
        is_firm
    FROM
        $spark_raw_all
    WHERE
        -- keep active businesses only: the code is the "Code" attribute of the Yson-encoded Status column
        Yson::ConvertToString(Yson::YPath(Yson::Parse(Status), "/@Code")) IN $working_ltds_codes
);

-- client_id -> SPARK entity lookup. The mapping table's spark_id is exposed as
-- max_spark_id (the original note calls it the "maximal" SPARK entity for a client_id).
$get_max_spark_id_by_clientid = (
    SELECT
        m.client_id AS client_id,
        m.spark_id AS max_spark_id
    FROM
        $clientid_to_sparkid AS m
);


-- Parent companies: distinct (spark_id, head_spark_id, head_name) triples read
-- from the "HeadOfCompany" entry of the _other column of $holdings.
-- Rows whose parent SparkID is missing or not an integer are dropped.
-- The inner projection carries only the columns the outer query reads.
-- NOTE(review): DISTINCT is over the whole triple, so one spark_id may still
-- appear with several parents.
$holdings_cooked = (
    SELECT DISTINCT
        spark_id,
        head_spark_id,
        head_name
    FROM (
        SELECT
            CAST(SparkID AS Int64) AS spark_id,
            -- parent's name and SparkID are looked up in the parsed HeadOfCompany dict
            Yson::ConvertToString(DictLookup(Yson::ConvertToDict(Yson::Parse(_other["HeadOfCompany"])), "Name")) AS head_name,
            CAST(Yson::ConvertToString(DictLookup(Yson::ConvertToDict(Yson::Parse(_other["HeadOfCompany"])), "SparkID")) AS Int64) AS head_spark_id
        FROM
            $holdings
    )
    WHERE
        head_spark_id IS NOT NULL
);


-- Financials of the companies themselves, one row per spark_id:
--   revenue             = sum of `value` over rows whose name contains "Выручка"
--   commercial_expenses = sum of `value` over rows whose name contains "Коммерческие расходы"
-- Values that cannot be cast to Float are counted as 0.
-- NOTE(review): the period is the hard-coded substring "2019" in date_begin
-- rather than $start_date / $day_after_end_date.
-- NOTE(review): Float is a 32-bit type; confirm that precision is enough for revenue amounts.
$turnover_cooked = (
    SELECT
        spark_id,
        SUM_IF(COALESCE(CAST(value AS Float), 0), name ILIKE "%Выручка%") AS revenue,
        SUM_IF(COALESCE(CAST(value AS Float), 0), name ILIKE "%Коммерческие расходы%") AS commercial_expenses
    FROM
        $company_turnover
    WHERE
        date_begin ILIKE "%2019%"
    GROUP BY
        spark_id
);

-- Money in Yandex is collected from two sources: billed acts (this query) and
-- served spend ($orderstat).
-- Billed acts per client for 2019: total amount plus the part where
-- ACTIVITY_TYPE_1_ID = 17 (presumably media banners, judging by the alias — confirm).
-- NOTE(review): the INNER JOIN keeps only clients present in $tiers; if $tiers
-- held several rows per client_id the sums would be multiplied — verify uniqueness.
$acts = (
    SELECT
        client_id,
        SUM(COALESCE(AMT_RUR, 0)) AS acts,
        SUM_IF(COALESCE(AMT_RUR, 0), ACTIVITY_TYPE_1_ID = 17) AS mediabanners_acts
    FROM
        $billing AS a
    INNER JOIN
        $tiers AS b
        ON a.ORDER_CLIENT_ID = b.client_id
    WHERE
        -- keep the ad formats of interest
        ACTIVITY_TYPE_1_NAME IN ('Справочники - Контексты','Медиа - Баннеры','Технологии')
        -- drop the sub-types that are not needed
        AND ACTIVITY_TYPE_2_NAME NOT IN ('Taxi', 'Special projects Market', 'Spravochnik')
        -- drop ico (essentially internal settlements with the European legal entity)
        AND ORDER_CLIENT_ID NOT IN (1370251, 2052443)
        AND DT_YEAR IS NOT NULL
        AND DT_MONTH IS NOT NULL
        AND COALESCE(CAST(a.DT_YEAR AS Int64), 0) = 2019
    GROUP BY
        a.ORDER_CLIENT_ID AS client_id
);

-- Formatter that renders a timestamp as a "YYYY-MM-DD" string.
$format = DateTime::Format("%Y-%m-%d");

-- Served spend per client: OrderStatDay rows with at least one show, joined to
-- the order dictionary to obtain ClientID, restricted to the half-open window
-- [$start_date, $day_after_end_date) in Moscow time.
$orderstat = (
    SELECT
        client_id,
        -- Cost is scaled by 1/1000000 * 30 and divided by 1.18; the alias reads "rubles without VAT".
        -- NOTE(review): 1.18 implies an 18% VAT rate while the window is calendar 2019 — confirm the rate for this period.
        SUM(CAST(COALESCE(a.Cost, 0) AS Double) / 1000000 * 30 / 1.18) AS sum_rub_wo_nds
    FROM
        $orderstat_log AS a
    INNER JOIN (
        SELECT DISTINCT
            OrderID,
            ClientID
        FROM
            -- `//home/direct/db/campaigns`. Sasha Duplishchev recommends OrderInfo over db/campaigns as the more up-to-date table: https://st.yandex-team.ru/ADVERTANALYTICS-5673
            $orderinfo
        WHERE
            -- 7 = Direct, 77 = MKB; the original note also lists 67 twice (Ya.Agency and internal advertising), neither of which is selected here
            EngineID IN (7, 77)
    ) AS b
        ON a.OrderID = b.OrderID
    WHERE
        COALESCE(a.Shows, 0) > 0
        -- dates are compared as "YYYY-MM-DD" strings
        AND $format(AddTimezone(DateTime::FromSeconds(CAST(a.UpdateTime AS Uint32)), "Europe/Moscow")) >= $start_date
        AND $format(AddTimezone(DateTime::FromSeconds(CAST(a.UpdateTime AS Uint32)), "Europe/Moscow")) < $day_after_end_date
    GROUP BY
        b.ClientID AS client_id
);

-- Both money sources combined: one row per client_id with served spend
-- (orderstat), billed acts and the media-banner part of the acts. Each source
-- contributes zeros for the measures it does not carry, and missing values
-- are counted as 0.
$money_joined = (
    SELECT
        client_id,
        SUM(COALESCE(orderstat, 0)) AS orderstat,
        SUM(COALESCE(acts, 0)) AS acts,
        SUM(COALESCE(mediabanners_acts, 0)) AS mediabanners_acts
    FROM (
        -- served spend only
        SELECT
            client_id,
            sum_rub_wo_nds AS orderstat,
            0 AS acts,
            0 AS mediabanners_acts
        FROM
            $orderstat

        UNION ALL

        -- billed acts only
        SELECT
            client_id,
            0 AS orderstat,
            acts,
            mediabanners_acts
        FROM
            $acts
    )
    GROUP BY
        client_id
);


-- Company attributes, one row per spark_id. If the schematized table holds
-- several rows for a company, SOME() picks an arbitrary value per attribute.
-- GROUP BY spark_id already yields unique rows, so no DISTINCT is needed on top.
$company_details = (
    SELECT
        spark_id,
        SOME(main_okved2_name) AS main_okved,
        SOME(workers_range) AS workers_range,
        -- company_size is a struct column; only its description member is kept
        SOME(company_size.description) AS company_size,
        SOME(inn) AS inn,
        SOME(ogrn) AS ogrn,
        SOME(legal_city) AS legal_city
    FROM
        $spark_schematized
    GROUP BY
        spark_id
);

-- Match evidence per (client_id, spark_id) pair: the list of distinct
-- (criterion, value) tuples found in the intersections table.
-- AGGREGATE_LIST_DISTINCT removes duplicate tuples itself, so the input is
-- read directly without a separate SELECT DISTINCT pass.
$match_info = (
    SELECT
        client_id,
        spark_id,
        AGGREGATE_LIST_DISTINCT(AsTuple(criterion, value)) AS match_info
    FROM
        $intersections
    GROUP BY
        client_id,
        spark_id
);


-- Single extract with maximum detail. Rows start from $working_ltds and are
-- LEFT JOINed to client mapping, parents, Yandex money, company financials,
-- company details, client tiers, cluster parameters, match evidence and IFRS
-- ad budgets. The letters in the section comments are the join aliases below.
$result = (
    SELECT
        CAST(a.SparkID AS Int64) AS spark_id,
        FullNameRus AS name,
        is_firm,

        -- b: ClientID level
        b.client_id AS client_id,

        -- c: parent companies
        head_spark_id,
        head_name,

        -- the company itself stands in when no parent is set (convenient for later aggregation)
        -- NOTE(review): "park" in the alias looks like a typo for "spark"; kept because it is an output column name
        COALESCE(CAST(head_spark_id AS Int64), CAST(a.SparkID AS Int64)) AS own_or_head_park_id,
        COALESCE(head_name, FullNameRus) AS own_or_head_name,

        -- d: spend in Yandex
        orderstat,
        acts,
        mediabanners_acts,

        -- e: company revenue data
        revenue,
        commercial_expenses,

        -- f: company details
        main_okved,
        workers_range,
        company_size,
        inn,
        ogrn,
        legal_city,

        -- g: client data
        client_type,
        curr_counterparty_reporting_tier.name AS curr_client_tier,
        curr_counterparty_name,
        curr_counterparty_id,
        curr_client_main_domain,
        curr_counterparty_sales_manager.name AS curr_manager_name,

        -- h: clustering parameters
        McKinsey_ads_to_revenue,
        McKinsey_industry,
        cluster,
        Yandex_industry,

        -- wallet estimate = revenue * ads-to-revenue ratio; NULL when the ratio is missing.
        -- NOTE(review): the ratio used to be wrapped in a one-argument NVL, which returns its
        -- argument unchanged; if a 0 default was intended, that would change NULL wallets to 0.
        COALESCE(e.revenue, 0) * h.McKinsey_ads_to_revenue AS wallet_size_by_revenue,

        -- NOTE(review): the two share-of-wallet columns below use client-level acts;
        -- the final SELECT of this script recomputes both from company-level totals.
        IF(
            COALESCE(e.revenue, 0) * h.McKinsey_ads_to_revenue = 0,
            NULL,
            CAST(COALESCE(d.acts, 0) AS Double) / (COALESCE(e.revenue, 0) * h.McKinsey_ads_to_revenue)
        ) AS sow_by_revenue,

        IF(
            COALESCE(e.commercial_expenses, 0) = 0,
            NULL,
            CAST(COALESCE(d.acts, 0) AS Double) / COALESCE(e.commercial_expenses, 0)
        ) AS sow_by_commercial_expenses,

        -- i: match evidence for the (client_id, spark_id) pair
        match_info,

        -- j: advertising budgets from IFRS reports
        msfo_ad_budget

    FROM
        $working_ltds AS a

    -- client mapping
    LEFT JOIN
        $get_max_spark_id_by_clientid AS b
        ON CAST(a.SparkID AS Int64) = b.max_spark_id

    -- parents
    LEFT JOIN
        $holdings_cooked AS c
        ON CAST(a.SparkID AS Int64) = CAST(c.spark_id AS Int64)

    -- our money
    LEFT JOIN
        $money_joined AS d
        ON b.client_id = d.client_id

    -- revenue data
    LEFT JOIN
        $turnover_cooked AS e
        ON CAST(a.SparkID AS Int64) = CAST(e.spark_id AS Int64)

    -- company details
    LEFT JOIN
        $company_details AS f
        ON CAST(a.SparkID AS Int64) = CAST(f.spark_id AS Int64)

    -- client tiers
    LEFT JOIN
        $tiers AS g
        ON b.client_id = g.client_id

    -- cluster parameters
    LEFT JOIN
        $clusters AS h
        ON CAST(a.SparkID AS Int64) = h.Spark_id

    -- match evidence
    LEFT JOIN
        $match_info AS i
        ON CAST(a.SparkID AS Int64) = i.spark_id AND b.client_id = i.client_id

    -- IFRS ad budgets
    LEFT JOIN
        $msfo AS j
        ON CAST(a.SparkID AS Int64) = j.spark_id
);


-- Total billed acts per SPARK company, summed over the clients mapped to it.
-- acts is a per-client amount (it is joined into $result by client_id), but
-- $result may repeat the same (spark_id, client_id) pair: $working_ltds is
-- distinct on (SparkID, FullNameRus, is_firm) and $holdings_cooked on
-- (spark_id, head_spark_id, head_name), so neither is unique per spark_id.
-- Each pair is therefore reduced to one row first, so a client's acts are
-- counted once regardless of how many detail rows it has.
$yandex_spend_by_spark_id = (
    SELECT
        spark_id,
        SUM(COALESCE(acts, 0)) AS sparkid_acts_total
    FROM (
        SELECT DISTINCT
            spark_id,
            client_id,
            acts
        FROM
            $result
    )
    GROUP BY
        spark_id
);


-- Final output: every $result row plus the company-level acts total.
-- The table at $output_path is overwritten (WITH TRUNCATE).
INSERT INTO $output_path WITH TRUNCATE
SELECT
    a.spark_id AS spark_id,
    name,
    is_firm,
    client_id,
    head_spark_id,
    head_name,
    own_or_head_park_id,
    own_or_head_name,
    orderstat,
    acts,
    mediabanners_acts,
    revenue,
    commercial_expenses,
    main_okved,
    workers_range,
    company_size,
    inn,
    ogrn,
    legal_city,

    client_type,
    curr_client_tier,
    curr_counterparty_name,
    curr_counterparty_id,
    curr_client_main_domain,
    curr_manager_name,

    McKinsey_ads_to_revenue,
    McKinsey_industry,
    cluster,
    Yandex_industry,

    -- the IFRS ad budget, when present, takes precedence over the revenue-based wallet estimate
    COALESCE(msfo_ad_budget, wallet_size_by_revenue) AS wallet_size_by_revenue,
    COALESCE(d.sparkid_acts_total, 0) AS sparkid_acts_total,

    -- share of wallet = company-level acts / wallet; NULL when the wallet is missing or zero
    IF(
        COALESCE(COALESCE(msfo_ad_budget, wallet_size_by_revenue), 0) = 0,
        NULL,
        CAST(COALESCE(d.sparkid_acts_total, 0) AS Double) / COALESCE(COALESCE(msfo_ad_budget, wallet_size_by_revenue), 0)
    ) AS sow_by_revenue,

    -- same ratio against commercial expenses; NULL when they are missing or zero
    IF(
        COALESCE(commercial_expenses, 0) = 0,
        NULL,
        CAST(COALESCE(d.sparkid_acts_total, 0) AS Double) / COALESCE(commercial_expenses, 0)
    ) AS sow_by_commercial_expenses,

    match_info,
    msfo_ad_budget

FROM
    $result AS a
LEFT JOIN
    $yandex_spend_by_spark_id AS d
    USING (spark_id)
;
