/* Рекомендация про минус-фразы

Тикет: OKK-2935

## Данные для проработки
1. В какой кампании больше всего подозрительного
Хотим подсказать, с чего лучше начать работу
Список кампаний с суммой показов и нашим прогнозом по подозрительному объему
- Надо ли показывать примеры запросов?
- Сразу разрешаем добавлять новые фразы в список единых на кампанию минус-фраз?
- Как подсказывать, какие конкретно минус-фразы стоит добавить?

-> проваливаемся в кампанию
Показываем самые подозрительные тройки: баннер, фразы, запросы.
Отображаем статистику фактических показов за последние 30 дней.
По какому сегменту показывать топ: баннеры, фразы или запросы?

2. В какой группе больше всего подозрительного
...как с кампаниями
-> проваливаемся в группу
...как с кампаниями
*/

-- Use `hahn` as the default cluster for every statement below.
USE hahn;
-- YT execution settings for this script: data size per job and default memory limit.
PRAGMA yt.DataSizePerJob = "3G";
PRAGMA yt.DefaultMemoryLimit = "8G";

-- Cuts the minus-words off a phrase: the text is split on the " -" separator
-- and only the first piece is returned
-- (e.g. "buy shoes -cheap -used" -> "buy shoes").
$drop_anti_words = ($phrase_text) -> {
  $parts = String::SplitToList($phrase_text, " -", True);
  RETURN $parts[0];
};
-- Returns the row's DSSMScore when the row looks like an irrelevant match,
-- otherwise 1.0 (so that `Shows - Shows * result` becomes 0 for such rows).
-- The relevance and wCTR thresholds were picked empirically.
-- The result is used to compute exact_bad_shows, which is the only
-- criterion of how significant an error is.
$calc_dssm = ($row) -> {
  -- Threshold tuned so that the flagged volume matches the old variant.
  $low_relevance = $row.DSSMScore < 0.07;
  -- wCTR: clicks per effective show.
  $low_wctr = ($row.Clicks / $row.eShows < 0.001);
  $has_banner = $row.DirectBannerID > 0;
  $enough_shows = $row.Shows > 1;
  $has_phrase = $row.PhraseText > "";
  -- Only queries that contain more words than the phrase are considered.
  $query_is_longer = $row.QueryWordsCnt > $row.PhraseWordsCnt;

  RETURN IF(
    $low_relevance
      AND $low_wctr
      AND $has_banner
      AND $enough_shows
      AND $has_phrase
      AND $query_is_longer,
    $row.DSSMScore,
    1.
  );
};

-- Which campaign has the most suspicious traffic.
-- Pre-aggregation: one row per (CampaignID, Query, phrase without minus-words,
-- TitleDB, BodyDB) with show/click totals and estimates of "bad" shows.
$bad_cid_preagg = (
  SELECT
    SOME(ClientID)     AS client_id,
    SOME(Login)        AS login,
    cid,
    SOME(CampaignName) AS campaign_name,
    query,
    phrase,
    title_db,
    body_db,
    SUM(Shows)  AS shows,
    SUM(eShows) AS eshows,
    SUM(Clicks) AS clicks,
    -- Shows discounted by the raw DSSM score.
    SUM(Shows - Shows * DSSMScore) AS pr_bad_shows,
    -- Same, but rows for which $calc_dssm returns 1 contribute 0.
    SUM(Shows - Shows * $calc_dssm(TableRow())) AS exact_bad_shows,
    -- Shows-weighted average scores.
    SUM(Shows * DSSMScore) / SUM(Shows)    AS DSSMScore,
    SUM(Shows * DSSMScoreOld) / SUM(Shows) AS DSSMScoreOld
  FROM `home/vipplanners/ta_report/30d/search_queries.data`
  GROUP BY
    CampaignID AS cid,
    Query AS query,
    $drop_anti_words(PhraseText) AS phrase,
    TitleDB AS title_db,
    BodyDB AS body_db
);

-- Campaign-level roll-up stored in the temporary table @bad_cid: totals,
-- shares of bad shows, shows-weighted scores and up to 10 example tuples
-- (title_db, body_db, phrase, query, exact_bad_shows) with the largest
-- exact_bad_shows.
INSERT INTO @bad_cid WITH TRUNCATE
SELECT
  SOME(client_id)     AS client_id,
  SOME(login)         AS login,
  cid,
  SOME(campaign_name) AS campaign_name,
  SUM(shows)  AS shows,
  SUM(eshows) AS eshows,
  SUM(clicks) AS clicks,
  SUM(pr_bad_shows)    AS pr_bad_shows,
  SUM(exact_bad_shows) AS exact_bad_shows,
  SUM(pr_bad_shows) / SUM(shows)    AS pr_bad_shows_share,
  SUM(exact_bad_shows) / SUM(shows) AS exact_bad_shows_share,
  SUM(shows * DSSMScore) / SUM(shows)    AS DSSMScore,
  SUM(shows * DSSMScoreOld) / SUM(shows) AS DSSMScoreOld,
  ListTake(
    ListSortDesc(
      AGGREGATE_LIST(
        -- Rows without exact bad shows produce an empty optional here.
        IF(
          exact_bad_shows > 0,
          AsTuple(title_db, body_db, phrase, query, exact_bad_shows)
        )
      ),
      -- Sort key: the tuple's last element, exact_bad_shows.
      ($example) -> { RETURN $example.4; }
    ),
    10
  ) AS top_examples
FROM $bad_cid_preagg
GROUP BY cid
;

-- Which ad group has the most suspicious traffic.
-- Pre-aggregation: one row per (GroupID, Query, phrase without minus-words,
-- TitleDB, BodyDB) with show/click totals and estimates of "bad" shows.
$bad_pid_preagg = (
  SELECT
    SOME(ClientID)     AS client_id,
    SOME(Login)        AS login,
    SOME(CampaignID)   AS cid,
    SOME(CampaignName) AS campaign_name,
    pid,
    query,
    phrase,
    title_db,
    body_db,
    SUM(Shows)  AS shows,
    SUM(eShows) AS eshows,
    SUM(Clicks) AS clicks,
    -- Shows discounted by the raw DSSM score.
    SUM(Shows - Shows * DSSMScore) AS pr_bad_shows,
    -- Same, but rows for which $calc_dssm returns 1 contribute 0.
    SUM(Shows - Shows * $calc_dssm(TableRow())) AS exact_bad_shows,
    -- Shows-weighted average scores.
    SUM(Shows * DSSMScore) / SUM(Shows)    AS DSSMScore,
    SUM(Shows * DSSMScoreOld) / SUM(Shows) AS DSSMScoreOld
  FROM `home/vipplanners/ta_report/30d/search_queries.data`
  GROUP BY
    GroupID AS pid,
    Query AS query,
    $drop_anti_words(PhraseText) AS phrase,
    TitleDB AS title_db,
    BodyDB AS body_db
);

-- Group-level roll-up stored in the temporary table @bad_pid: totals,
-- shares of bad shows, shows-weighted scores and up to 10 example tuples
-- (title_db, body_db, phrase, query, exact_bad_shows) with the largest
-- exact_bad_shows.
INSERT INTO @bad_pid WITH TRUNCATE
SELECT
  SOME(client_id)     AS client_id,
  SOME(login)         AS login,
  SOME(cid)           AS cid,
  SOME(campaign_name) AS campaign_name,
  pid,
  SUM(shows)  AS shows,
  SUM(eshows) AS eshows,
  SUM(clicks) AS clicks,
  SUM(pr_bad_shows)    AS pr_bad_shows,
  SUM(exact_bad_shows) AS exact_bad_shows,
  SUM(pr_bad_shows) / SUM(shows)    AS pr_bad_shows_share,
  SUM(exact_bad_shows) / SUM(shows) AS exact_bad_shows_share,
  SUM(shows * DSSMScore) / SUM(shows)    AS DSSMScore,
  SUM(shows * DSSMScoreOld) / SUM(shows) AS DSSMScoreOld,
  ListTake(
    ListSortDesc(
      AGGREGATE_LIST(
        -- Rows without exact bad shows produce an empty optional here.
        IF(
          exact_bad_shows > 0,
          AsTuple(title_db, body_db, phrase, query, exact_bad_shows)
        )
      ),
      -- Sort key: the tuple's last element, exact_bad_shows.
      ($example) -> { RETURN $example.4; }
    ),
    10
  ) AS top_examples
FROM $bad_pid_preagg
GROUP BY pid
;

-- Make both temporary tables available to the statements that follow.
COMMIT;

-- Publishes the recommendations: campaign-level rows (@bad_cid) and
-- group-level rows (@bad_pid) combined, keeping only rows that have a
-- positive exact_bad_shows.
INSERT INTO `//home/vipplanners/ta_report/recommendations/current` WITH TRUNCATE
SELECT
  client_id,
  login,
  cid,
  campaign_name,
  -- NOTE(review): @bad_cid has no pid column; presumably UNION ALL matches
  -- columns by name and leaves pid empty for campaign-level rows — confirm.
  pid,
  -- bid is always NULL in this script.
  CAST(NULL AS Int64) AS bid,
  shows,
  eshows,
  clicks,
  pr_bad_shows,
  exact_bad_shows,
  pr_bad_shows_share,
  exact_bad_shows_share,
  DSSMScore,
  DSSMScoreOld,
  top_examples
FROM (
  SELECT * FROM @bad_cid
  UNION ALL
  SELECT * FROM @bad_pid
)
WHERE exact_bad_shows > 0
ORDER BY client_id, cid, pid, bid
;
