# coding=utf-8

import yt.wrapper as yt
from bm.yt_tools import set_upload_time
import irt.broadmatching.common_options


def get_clear_domain(url):
    """Return the host part of *url* without scheme, path or leading ``www.``.

    Everything up to and including the last ``://`` is dropped, then
    everything from the first ``/`` onwards.  A ``www.`` prefix is removed
    only when something follows it, so the bare string ``"www."`` is
    returned unchanged.
    """
    without_scheme = url.rpartition("://")[2]
    host = without_scheme.partition('/')[0]
    if len(host) > 4 and host.startswith("www."):
        return host[4:]
    return host


def get_sec_level_domain(full_domain):
    """Return the last two dot-separated labels of *full_domain*.

    A value with fewer than two labels is returned as is.  The split is
    purely textual, so ``"x.y.co.uk"`` gives ``"co.uk"``.
    """
    labels = full_domain.rsplit('.', 2)
    tail = labels[-2:]
    return '.'.join(tail)


class FilterDomain:
    """Row filter that keeps active rows whose domain is not blacklisted.

    An instance is passed to ``run_map`` in ``main``.  The blacklists are
    read by ``start`` -- presumably invoked by the YT job runtime before any
    row is processed (TODO confirm against the yt.wrapper documentation).
    """

    def start(self):
        """Load both blacklists from files in the current working directory.

        Every line of each file, stripped of surrounding whitespace, becomes
        one entry of the corresponding set.
        """
        self.bad_domains = set()
        self.bad_domains_2level = set()

        targets = (
            ('dyn_bad_domains', self.bad_domains),
            ('dyn_bad_domains_2level', self.bad_domains_2level),
        )
        for file_name, bucket in targets:
            with open(file_name) as source:
                bucket.update(entry.strip() for entry in source)

    def __call__(self, row):
        """Yield ``{'Domain': domain}`` for a row that passes every check.

        Nothing is yielded when ``statusActive`` is not ``'Yes'``, when
        ``domain`` is missing, when the cleaned domain is empty or listed in
        ``bad_domains``, or when its last two labels are listed in
        ``bad_domains_2level``.  The emitted value is the raw ``domain``
        field, not the cleaned one.
        """
        is_active = row.get('statusActive') == 'Yes'
        if not is_active:
            return

        raw_domain = row.get('domain')
        if raw_domain is None:
            return

        host = get_clear_domain(raw_domain)
        if not host:
            return
        if host in self.bad_domains:
            return
        if get_sec_level_domain(host) in self.bad_domains_2level:
            return

        yield {'Domain': raw_domain}


def unique(key, rows):
    """Emit *key* exactly once and ignore *rows*.

    Used as the reducer in ``main`` with ``reduce_by=['Domain']``; presumably
    YT calls it once per distinct ``Domain`` value, which would make the
    output one row per domain (TODO confirm against yt.wrapper reduce
    semantics).
    """
    # ``rows`` is never read: only the grouping key is needed.
    yield key


def main():
    """Rebuild the table of unique, non-blacklisted domains of active banners.

    All steps run inside a single YT client transaction:

    1. map ``//home/direct/db/banners`` through ``FilterDomain`` into a
       temporary table, shipping both blacklist files with the job;
    2. sort the temporary table by ``Domain``;
    3. reduce it by ``Domain`` with ``unique`` into the result table;
    4. remove the temporary table and call ``set_upload_time`` on the result.
    """
    client_config = yt.default_config.get_config_from_env()
    client_config["remote_temp_tables_directory"] = "//home/catalogia/tmp"
    client_config["spec_defaults"] = {"pool": 'catalogia'}
    # NOTE(review): this writes to the module-level ``yt.config``, not to
    # ``client_config`` that the client below is built from -- confirm that
    # the setting is meant to (and does) apply to this client's operations.
    yt.config['mount_sandbox_in_tmpfs'] = True
    client = yt.YtClient(config=client_config)

    destination = '//home/bannerland/data/dse/preparing/dyn-perf-domains'
    scratch = destination + '-tmp'

    with client.Transaction():
        # Files are uploaded next to the job; FilterDomain.start opens them
        # by the same base names.
        blacklist_files = [
            irt.broadmatching.common_options.get_options()['dicts'] + suffix
            for suffix in ('/dyn_bad_domains', '/dyn_bad_domains_2level')
        ]
        client.run_map(
            FilterDomain(),
            '//home/direct/db/banners',
            yt.TablePath(name=scratch, append=True),
            local_files=blacklist_files,
            spec={"combine_chunks": True})

        client.run_sort(
            scratch,
            sort_by=['Domain'],
            spec={"combine_chunks": True})

        client.run_reduce(
            unique,
            scratch,
            destination,
            reduce_by=['Domain'],
            spec={"combine_chunks": True})

        client.remove(scratch)
        set_upload_time(destination, client)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
