# Query template (appears to be YQL with an embedded Python 2 UDF script)
# that derives, per host, a list of common URL path prefixes.
#
# Placeholders: {output_table} and {output_whitelist_table}. Every other
# brace in the text is doubled ({{ }}), so the template is presumably
# rendered with str.format() -- confirm at the call site.
#
# Structure of the query:
#   * $script defines a trie over '/'-separated path segments (Node / Bor)
#     plus the create/add/merge/get_result/serialize/deserialize callbacks
#     that are wired into $udaf. The value of $recall_parameter is spliced
#     into the script source as text (see the `@@ || Cast(...) || @@` join
#     inside Node.get_results).
#   * $aggregation groups the rows of the four byCountry w2v tables by
#     lower-cased host, runs the UDAF over the distinct paths and collects
#     the distinct countries (taken from a segment of the table path).
#   * The first INSERT writes host, the tab-joined lower-cased path prefixes
#     (or '/' when the accumulated recall does not exceed
#     $recall_parameter) and the tab-joined countries, ordered by host.
#   * The second INSERT writes the distinct lower-cased hosts found in the
#     four domain_whitelist_* tables.
#
# NOTE: this is a regular (non-raw) Python string, so each '\t' below is
# already a literal tab character in the resulting query text.
get_hosts = '''
PRAGMA yt.QueryCacheMode = 'normal';
PRAGMA yt.DefaultMemoryLimit = '3G';
PRAGMA yt.InferSchema;

$recall_parameter=0.95;

$script = @@
import json
from collections import defaultdict
from urlparse import urlsplit

class Node(object):
    def __init__(self):
        self.next_node = defaultdict(Node)
        self.count = 0
        self.track_count = 0

    def add(self, iterable):
        try:
            next_step = iterable.next()
            self.track_count += 1
            self.next_node[next_step].add(iterable)
        except StopIteration:
            self.count += 1

    def merge(self, other):
        self.count += other.count
        self.track_count += other.track_count
        if not self.next_node:
            self.next_node = other.next_node
        else:
            for char in other.next_node:
                self.next_node[char].merge(other.next_node[char])

    def find_cross(self, you):
        if len(self.next_node) == 1:
            return [you] + self.next_node.values()[0].find_cross(self.next_node.keys()[0])
        elif len(self.next_node) > 1:
            return [you]
        else:
            return []


    def get_results(self):
        top_steps = sorted( ((k, float(v.track_count) / self.track_count) for k, v in self.next_node.iteritems() if v.track_count != 0), key=lambda x: -x[1])
        iteresting_steps = list()
        sum_recall = 0
        for i, (step, recall) in enumerate(top_steps):
            if i > 100 or sum_recall > @@ || Cast($recall_parameter as String) || @@:
                break
            sum_recall += recall
            iteresting_steps.append(step)

        results = list()
        for k in iteresting_steps:
            result = self.next_node[k].find_cross(k)
            results.append((float(self.next_node[k].track_count) / self.track_count, result))
        return results, sum_recall

    def to_dict(self):
        return {{
            'count': self.count,
            'track_count': self.track_count,
            'next_node': {{k: v.to_dict() for k, v in self.next_node.iteritems()}}
        }}

    def from_dict(self, d):
        self.count = d['count']
        self.track_count = d['track_count']
        for k, v in d['next_node'].iteritems():
            self.next_node[k].from_dict(v)


class Bor(object):
    def __init__(self):
        self.head = Node()

    def add(self, path):
        path_parts = urlsplit(path).path.strip('/').split('/')
        self.head.add(iter(path_parts))

    def merge(self, other):
        self.head.merge(other.head)

    def get_results(self):
        results, sum_recall = self.head.get_results()
        return {{'path_list': [{{'recall': recall, 'path': '/' + '/'.join(parts)}} for recall, parts in results], 'recall':sum_recall}}


def create(item):
    b = Bor()
    b.add(item)
    return b


def add(bor, item):
    bor.add(item)
    return bor


def merge(bor_a, bor_b):
    if bor_a.head.track_count < bor_b.head.track_count:
        bor_a, bor_b = bor_b, bor_a
    bor_a.merge(bor_b)
    return bor_a


def get_result(bor):
    return bor.get_results()


def serialize(state):
    return json.dumps(state.head.to_dict())


def deserialize(serialized):
    b = Bor()
    b.head.from_dict(json.loads(serialized))
    return b

@@;

--------------------------------------------------------------------------------
-- Declare signatures of functions defined in the script above:
--------------------------------------------------------------------------------
$create = Python::create("(String)->Resource<Python2>", $script);
$add = Python::add("(Resource<Python2>,String)->Resource<Python2>", $script);
$merge = Python::merge("(Resource<Python2>,Resource<Python2>)->Resource<Python2>", $script);
$get_result = Python::get_result("(Resource<Python2>)->Struct<'recall':Double,'path_list':List<Struct<'recall':Double,'path':String>>>", $script);
$serialize = Python::serialize("(Resource<Python2>)->String", $script);
$deserialize = Python::deserialize("(String)->Resource<Python2>", $script);

$udaf = UDAF(
            DISTINCT path,
            $create,
            $add,
            $merge,
            $get_result,
            $serialize,
            $deserialize
);

$aggregation = (
    select host, aggregation.path_list as path_list, aggregation.recall as sum_recall, countries as countries
    from (
        select
            host as host,
            $udaf as aggregation,
            LIST(DISTINCT country) as countries
        from (
            select String::ToLower(Url::GetHost(url)) as host, Url::GetPath(url) as path, String::SplitToList(TablePath(), '/'){{6}} as country
            from Concat(
                [home/recommender/zen/content_features/kosher/byCountry/belarus/data/w2v],
                [home/recommender/zen/content_features/kosher/byCountry/russia/data/w2v],
                [home/recommender/zen/content_features/kosher/byCountry/ukraine/data/w2v],
                [home/recommender/zen/content_features/kosher/byCountry/kazakhstan/data/w2v]
            )
            where Length(url) > 0
        )
        group by host
    )
);

insert into [{output_table}]
WITH TRUNCATE
select
    host,
    case
        when sum_recall > $recall_parameter
        then String::JoinFromList(
            ListFlatMap(
                path_list,
                ($y) -> {{
                    RETURN String::ToLower($y.path);
                }}
            ),
            '\t'
        )
        else '/'
    end as path,
    String::JoinFromList(countries, '\t') as countries
from $aggregation
order by host;

insert into [{output_whitelist_table}]
WITH TRUNCATE
SELECT DISTINCT host
FROM (
    SELECT String::ToLower(Url::GetHost(key)) as host
    FROM CONCAT(
        [home/recommender/zen/bunker/domain_whitelist_kz],
        [home/recommender/zen/bunker/domain_whitelist_ua],
        [home/recommender/zen/bunker/domain_whitelist_be],
        [home/recommender/zen/bunker/domain_whitelist_ru]
    )
);
'''
