# coding=utf-8

import filelock
import re

from bm.bmyt import BMYT
from bm.yt_tools import NormalizeBase, set_upload_time, get_upload_time
from irt.common.geobase import geobase_lookup, id_to_region_name
import irt.broadmatching.common_options
import irt.common.unidecode
import yt.wrapper as yt_wrapper

import random
from urlparse import urlparse
import heapq
import logging

import irt.bannerland.options


from irt.broadmatching.generate_dse_banners.tools import (
    filter_freq, join_hits_by_norm, is_bs_compatible,
    phrase_preprocess, prepare_url, normalize_url_strong,
    strong_url_weaken, second_level_domains_equal,
    add_norm_url_to_table, enforce_dyntable_guarantees,
    FirstReducer
)


# Row-weight ceiling handed to the YT table writer through YT_HEAVY_ROW_SPEC below.
YT_MAX_ROW_WEIGHT = 128000000
# 8 * yt.wrapper's MB constant (presumably 8 megabytes - confirm);
# not referenced in the part of the file visible here.
YT_FINAL_MAX_ROW_SIZE = 8 * yt_wrapper.common.MB
# Operation spec fragment that raises the table writer's max_row_weight; passed as
# `spec=` to the reduce operations that emit very large rows.
YT_HEAVY_ROW_SPEC = {"job_io": {"table_writer": {"max_row_weight": YT_MAX_ROW_WEIGHT}}}

# Value of 'bl_phrases' that marks a banner without phrases.
EMPTY_PHRASE = ''

# Log lines are tab-separated: timestamp, process id, message; INFO level and above.
logging.basicConfig(format="%(asctime)s\t[%(process)d]\t%(message)s", level=logging.INFO)

# Two alternative threshold sets for the DssmCtr / DssmCtrNoMiner / DssmUta row fields.
# QueriesUrlNormalizeReducer keeps a query row when all three scores exceed their
# thresholds in at least one of the sets.
DSSM_CTR_1 = 0.58
DSSM_CTR_NO_MINER_1 = 0.6
DSSM_UTA_1 = 0.6
DSSM_CTR_2 = 0.55
DSSM_CTR_NO_MINER_2 = 0.5
DSSM_UTA_2 = 0.79

# Titles are collected only from rows whose UrlAgeSeconds is below this value.
URL_AGE_SECONDS = 8000000
# Prepared URLs longer than this many characters are dropped by the prepare jobs.
URL_LEN_LIMIT = 2048

# YT table path; not referenced in the part of the file visible here.
EXPORT_DOMAINS_TABLE = '//home/bannerland/export/webmaster/dse_domains'


class TODOREMOVEDURINGBMYTREFACTORING(BMYT):
    """BMYT subclass that fills in bannerland defaults before delegating to BMYT.

    Defaults are only applied where the caller did not provide a value:
    the catalogia library name, the list of generated dictionaries and
    ``bm_options``.
    """

    def __init__(self, **kwargs):
        """Complete ``catalogia_spec`` / ``bm_options`` and call ``BMYT.__init__``.

        A missing or ``None`` ``catalogia_spec`` is replaced by a fresh dict;
        a dict supplied by the caller is updated in place.
        """
        spec = kwargs.get('catalogia_spec')
        if spec is None:
            spec = {}
            kwargs['catalogia_spec'] = spec

        lib_spec = spec.setdefault('lib', {})
        lib_spec.setdefault('name', 'bm_bannerland_lib')

        # BL resources: currently match $required_resources{bannerland} in Resources.pm
        # TODO: add res for dyn: dyn_stat
        default_gendicts = [{'name': name} for name in ('generated_dicts', 'models')]
        spec.setdefault('gendicts', default_gendicts)

        # The options are fetched unconditionally, exactly as setdefault's
        # eagerly evaluated argument would be.
        bm_options = irt.bannerland.options.get_options()
        kwargs.setdefault('bm_options', bm_options)

        super(TODOREMOVEDURINGBMYTREFACTORING, self).__init__(**kwargs)


def string_aggregate(list, delim="\t"):
    """Join the distinct non-empty items of ``list`` with ``delim``.

    Duplicates are collapsed through a set, so the order of the items in the
    result is not defined.
    """
    distinct = set()
    for item in list:
        if len(item) > 0:
            distinct.add(item)
    return delim.join(distinct)


class UniDecoderJob(object):
    """Base class for jobs that clean text through ``self.decoder``."""

    def start(self):
        """Create the decoder and store it on the instance.

        NOTE(review): presumably called by the YT wrapper once per job before
        any rows are processed - confirm.
        """
        decoder = irt.common.unidecode.DirectUniDecoder()
        self.decoder = decoder


# also serves as title preparer
class QueriesUrlNormalizeReducer(UniDecoderJob):
    """Reducer over (Host, Path) groups that emits query rows and title rows.

    Output table 0 receives one row per input row that passes the DSSM
    thresholds; output table 1 receives the cleaned titles of the URL, or a
    single 'empty_title' placeholder when no usable title was found.
    """

    def _title_row(self, url, strong_url, link_text, source):
        """Build a row for the title output (table index 1)."""
        return {
            'url': url,
            'strong_url': strong_url,
            'link_text': link_text,
            'source': source,
            '@table_index': 1
        }

    def __call__(self, key, rows):
        url = prepare_url(key['Host'] + key['Path'])
        if len(url) > URL_LEN_LIMIT:
            return
        strong_url = normalize_url_strong(url)
        if not second_level_domains_equal(strong_url, url):
            return

        # A row is kept when it clears every threshold of at least one set.
        threshold_sets = (
            (DSSM_CTR_1, DSSM_CTR_NO_MINER_1, DSSM_UTA_1),
            (DSSM_CTR_2, DSSM_CTR_NO_MINER_2, DSSM_UTA_2),
        )
        fresh_titles = set()
        for row in rows:
            # titles are collected regardless of the DSSM check below
            if 'Title' in row and row['UrlAgeSeconds'] < URL_AGE_SECONDS:
                fresh_titles.add(row['Title'])
            passes_dssm = any(
                row['DssmCtr'] > ctr and row['DssmCtrNoMiner'] > no_miner and row['DssmUta'] > uta
                for ctr, no_miner, uta in threshold_sets
            )
            if passes_dssm:
                yield {
                    '@table_index': 0,
                    'url': url,
                    'strong_url': strong_url,
                    'query': row['Query'],
                    'region_id': row['RegionId'],
                    'clicks': row['Clicks'],
                    'shows': row['Shows'],
                    'position': row['Position']
                }

        # prepare titles
        emitted_title = False
        for raw_title in fresh_titles:
            if len(raw_title) > 0:
                clean_title = self.decoder.decode(raw_title)
                if is_bs_compatible(clean_title):
                    yield self._title_row(url, strong_url, clean_title, 'title')
                    emitted_title = True
        if not emitted_title:
            # placeholder so the URL is still present in the title table
            yield self._title_row(url, strong_url, 'true', 'empty_title')


class QueriesNormReducer(NormalizeBase):
    """Reducer over (strong_url, query) groups that normalizes the query.

    For every input row it emits the normalized query (``orig`` = 1) and,
    unless the normalized phrase already has 6 words, the normalized
    "query + region name" variant (``orig`` = 0).
    """

    def __init__(self):
        super(QueriesNormReducer, self).__init__(local_files=True)

    def _make_row(self, key, row, norm, orig):
        """Build an output row for ``row`` carrying the given ``norm`` and ``orig`` flag."""
        return {
            'strong_url': key['strong_url'],
            'url': row['url'],
            'clicks': row['clicks'],
            'position': row['position'],
            'shows': row['shows'],
            'region': row['region_id'],
            'norm': norm,
            'orig': orig
        }

    def __call__(self, key, rows):
        # the geobase lookup is created lazily on the first call
        if not hasattr(self, 'lookup'):
            self.lookup = geobase_lookup()
        # long phrases almost always have zero freq, ignore them
        query = phrase_preprocess(key['query'])
        query_wc = len(query.split())
        if query_wc > 12 or query_wc < 1:
            return
        norm = self.norm_phr(query)
        # normalized phrases with >6 words almost always have zero freq, ignore them
        norm_wc = len(norm.split())
        if norm_wc > 6 or norm_wc < 1:
            return
        # region_id -> normalized "query + region name", shared by the rows of this group
        region_norm_cache = {}
        for row in rows:
            yield self._make_row(key, row, norm, 1)
            # adding region to >=6 - word phrase will produce too long phrase, which will almost always have zero freq
            if norm_wc == 6:
                continue
            region_id = row['region_id']
            if region_id in region_norm_cache:
                region_norm = region_norm_cache[region_id]
            else:
                try:
                    region = id_to_region_name(self.lookup, region_id)
                except Exception:
                    # Best effort: a region that cannot be resolved just gets no
                    # region variant. Exception (not a bare except) so that
                    # SystemExit / KeyboardInterrupt still propagate.
                    continue
                region_norm = self.norm_phr(query + ' ' + region)
                region_norm_cache[region_id] = region_norm
            yield self._make_row(key, row, region_norm, 0)


class QueriesPreReducer():
    """Reducer that collapses all rows of one key into a single aggregated row.

    The output row contains the summed ``clicks`` and ``shows``, the integer
    average ``position``, ``orig`` = 1 if any input row had a positive
    ``orig``, a ``urls`` dict ({url: 1}) of the distinct URLs, and all key
    fields. ``region`` is set only when every row shares the same region.
    Nothing is emitted for an empty group.
    """

    def __call__(self, key, rows):
        regions = set()
        urls = {}
        count = 0
        clicks = 0
        shows = 0
        position = 0
        orig = 0
        for row in rows:
            clicks += int(row['clicks'])
            shows += int(row['shows'])
            position += int(row['position'])
            regions.add(row['region'])
            urls[row['url']] = 1
            count += 1
            if int(row['orig']) > 0:
                orig = 1
        if not count:
            return
        result = {
            'clicks': clicks,
            'shows': shows,
            # explicit floor division: the average stays an integer regardless
            # of the interpreter's `/` semantics
            'position': position // count,
            'orig': orig,
            'urls': urls
        }
        result.update(key)
        if len(regions) == 1:
            # next(iter(...)) instead of indexing keys(), which is not
            # possible on dict views
            result['region'] = next(iter(regions))
        yield result


class RawLinkPrepareReducer(UniDecoderJob):
    """Reducer over TargetUrl groups that turns link texts into 'links_title' rows.

    The group is dropped when the prepared URL is too long or when strong
    normalization changes its second-level domain.
    """

    def __call__(self, key, rows):
        url = prepare_url(key['TargetUrl'])
        if len(url) > URL_LEN_LIMIT:
            return
        strong_url = normalize_url_strong(url)
        if not second_level_domains_equal(strong_url, url):
            return

        # distinct link texts of this URL
        link_texts = set()
        for row in rows:
            if 'Text' in row:
                link_texts.add(row['Text'])

        for raw_text in link_texts:
            if len(raw_text) > 0:
                clean_text = self.decoder.decode(raw_text)
                if is_bs_compatible(clean_text):
                    yield {
                        'url': url,
                        'strong_url': strong_url,
                        'link_text': clean_text,
                        'source': 'links_title',
                    }


class KWYTPrepareMapper(UniDecoderJob):
    """Mapper that converts a row with 'URL' and 'Title' into a 'kwyt_title' row."""

    def __call__(self, row):
        # rows without a title produce nothing
        if not row['Title']:
            return
        url = prepare_url(row['URL'])
        if len(url) > URL_LEN_LIMIT:
            return
        strong_url = normalize_url_strong(url)
        if not second_level_domains_equal(strong_url, url):
            return
        # the decoded title is written back into the input row, as before
        clean_title = self.decoder.decode(row['Title'])
        row['Title'] = clean_title
        if not is_bs_compatible(clean_title):
            return
        yield {
            'url': url,
            'strong_url': strong_url,
            'link_text': clean_title,
            'source': 'kwyt_title',
        }


class LandingPrepareMapper(UniDecoderJob):
    """Mapper that converts landing-page rows into 'landing_title' rows.

    Only rows with non-empty 'LandingPageTitle' and 'DirectURL' are used.
    """

    def __call__(self, row):
        # a missing key and an empty value are treated the same way
        if not (row.get('LandingPageTitle') and row.get('DirectURL')):
            return
        # the decoded title is written back into the input row, as before
        clean_title = self.decoder.decode(row['LandingPageTitle'])
        row['LandingPageTitle'] = clean_title
        url = prepare_url(row['DirectURL'])
        if len(url) > URL_LEN_LIMIT:
            return
        strong_url = normalize_url_strong(url)
        if not second_level_domains_equal(strong_url, url):
            return
        if not is_bs_compatible(clean_title):
            return
        yield {
            'url': url,
            'strong_url': strong_url,
            'link_text': clean_title,
            'source': 'landing_title',
        }


class LinkPrepareReducer():
    """Reducer over strong_url groups that aggregates link texts per source.

    Emits one row with 'strong_url', a 'urls' dict ({url: 1}) and one
    tab-joined text field per source. At most 1000 distinct texts per source
    and 1000 URLs are collected.
    """

    def __call__(self, key, rows):
        strong_url = key['strong_url']
        texts_by_source = {}
        urls = {}
        for row in rows:
            source_texts = texts_by_source.setdefault(row['source'], {})
            if len(source_texts) < 1000:
                source_texts[row['link_text']] = 1
            if len(urls) < 1000:
                urls[row['url']] = 1

        # links-only banners do not count as real banners
        # empty urls from raw_queries dont count either
        # but if url was in raw_queries and has links_title, it will do
        if len(texts_by_source) == 1:
            only_source = next(iter(texts_by_source))
            if only_source in ('links_title', 'empty_title', 'landing_title'):
                return

        texts_by_source.pop('empty_title', None)

        res = {'strong_url': strong_url, 'urls': urls}
        for source, source_texts in texts_by_source.items():
            text = string_aggregate(source_texts.keys())
            # a long text usually means that the titles are either irrelevant or too many and too diverse
            # in both cases it is better not to use it for generation
            # such texts also slow down the process_dse_base_row_title_step mapper
            if 0 < len(text) < 10000:
                res[source] = text
        yield res


class UrlsReducer():
    """Reducer that merges the 'urls' dicts of every row sharing a strong_url."""

    def __call__(self, key, rows):
        result = {'strong_url': key['strong_url'], 'urls': {}}
        merged_urls = result['urls']
        for row in rows:
            for url, flag in row['urls'].items():
                merged_urls[url] = flag
        yield result


class AggregateReducer():
    """Reducer over strong_url groups that packs normalized phrases into chunks.

    Rows from input table 0 contribute their 'urls' dict; all other rows
    contribute an encoded norm. The attributes ``norm_format`` (list of row
    fields to encode) and ``stop_words`` (collection of words) must be set on
    the instance before it is used.
    """

    def encode_norm(self, row):
        """Return the ``norm_format`` fields of ``row`` joined by ':'.

        Fields missing from ``row`` are stored in it as '' before encoding.
        """
        return ':'.join(str(row.setdefault(field, '')) for field in self.norm_format)

    def add_pluses_before_stop_words(self, phrase):
        """Prefix stop words with '+' and return the words sorted and space-joined."""
        words = []
        for word in phrase.split(' '):
            if word in self.stop_words:
                word = '+' + word
            words.append(word)
        words.sort()
        return ' '.join(words)

    def _chunk_row(self, encoded_norms, strong_url, urls, domain):
        """Build one output row out of the accumulated encoded norms."""
        return {
            'norm': string_aggregate(encoded_norms.keys()),
            'strong_url': strong_url,
            'urls': urls,
            'domain': domain,
        }

    def __call__(self, key, rows):
        strong_url = key['strong_url']
        host = urlparse(key['strong_url']).netloc
        # keep only the last two labels of the host name
        domain = '.'.join(host.split('.')[-2:])
        urls = {}
        encoded_norms = {}
        pending = 0

        for row in rows:
            if row['@table_index'] == 0:
                urls.update(row['urls'])
                continue
            plused = self.add_pluses_before_stop_words(row['norm'].decode('utf-8'))
            row['norm'] = filter_freq(plused.encode('utf-8'), int(row['hits']), int(row['exact_hits']))
            if row['norm'] is not None and len(row['norm']) > 0:
                encoded_norms[self.encode_norm(row)] = 1
                pending += 1
            if pending > 50000:
                # split norms to chunks because of string size limit
                yield self._chunk_row(encoded_norms, strong_url, urls, domain)
                pending = 0
                encoded_norms = {}
        if pending > 0:
            # flush the last, partially filled chunk
            yield self._chunk_row(encoded_norms, strong_url, urls, domain)


class EmptyPhraseLeftJoinReducer():
    """Left-join reducer that attaches 'bl_phrases' to every row of a group.

    When the first row of the group comes from table 0, its 'bl_phrases' is
    copied to the remaining rows and that first row itself is not emitted.
    Otherwise every row, including the first, gets EMPTY_PHRASE. All emitted
    rows have '@table_index' set to 0.
    """

    def __call__(self, key, rows):
        row_iter = iter(rows)
        head = next(row_iter)
        overlay = {
            '@table_index': 0,
            'bl_phrases': EMPTY_PHRASE,
        }
        if head['@table_index'] == 0:
            overlay['bl_phrases'] = head['bl_phrases']
            pending = []
        else:
            # no phrases row for this key: the head is an ordinary row
            pending = [head]
        for row in pending:
            row.update(overlay)
            yield row
        for row in row_iter:
            row.update(overlay)
            yield row


class InnerJoinReducer():
    """Inner-join reducer: merges the first (table 0) record into all the others.

    Groups whose first record does not come from table 0 produce nothing;
    the table 0 record itself is never emitted.
    """

    def __call__(self, key, recs):
        rec_iter = iter(recs)
        head = next(rec_iter)
        if head['@table_index'] == 0:
            for rec in rec_iter:
                # fields of the head record win, including '@table_index'
                rec.update(head)
                yield rec


class BaseJoinReducer():
    """Joins the table 0 record of a strong_url with the remaining records.

    Every joined record is emitted once per URL found in either 'urls' dict.
    When there is nothing to join with, the table 0 record alone is emitted
    once per its own URL with an empty 'norm' and a 'domain' derived from the
    key. Groups that do not start with a table 0 record are dropped.

    Note: the same dict object is yielded repeatedly with only 'url' changed.
    """

    def __call__(self, key, recs):
        rec_iter = iter(recs)
        base = next(rec_iter)
        base_urls = base.pop('urls', {})
        if base['@table_index'] != 0:
            return

        joined_any = False
        for rec in rec_iter:
            joined_any = True
            merged_urls = rec.pop('urls', {})
            merged_urls.update(base_urls)
            rec.update(base)
            # merge the url dicts from both tables and expand them into separate rows
            for url in merged_urls:
                rec['url'] = url
                yield rec
        if joined_any:
            return

        # nothing joined: emit the base record on its own
        host = urlparse(key['strong_url']).netloc
        base['domain'] = '.'.join(host.split('.')[-2:])
        base['norm'] = ''
        for url in base_urls:
            base['url'] = url
            yield base


class FilterMainpagesReducer():
    """Reducer that drops main-page rows from groups with enough other rows.

    Rows recognized as main pages are held back and emitted at the end only
    when fewer than 10 other rows were emitted for the group.
    """

    def __call__(self, key, rows):
        # Raw string, compiled once per group instead of once per row.
        yandex_host_re = re.compile(r'(^|\.)yandex\.ru$')
        mainpage_rows = []
        count = 0
        limit = 10
        min_length = float("inf")
        for row in rows:
            url = row['url']
            # mainpage will always have minimal length among all urls,
            # so longer urls are passed through without being parsed
            if len(url) <= min_length:
                parsed_url = urlparse(url)
                # remember minimum length unless it's yandex (because all yandex urls with zero path need to be filtered)
                # other urls are filtered only in case of true main page (http://example.org/)
                is_yandex = yandex_host_re.search(parsed_url.netloc)
                if not is_yandex:
                    min_length = len(url) + 1
                if len(parsed_url.path) <= 1 and (is_yandex or not parsed_url.query):
                    mainpage_rows.append(row)
                    continue
            yield row
            count += 1
        if count < limit:
            # too few regular rows: keep the main pages after all
            for row in mainpage_rows:
                yield row


class PhrasesDupMaxReducer():
    """Reducer that reports the largest 'dup_rank' of a (domain, phrase) group.

    The maximum starts at 0, so a group with only negative ranks yields 0.
    """

    def __call__(self, key, rows):
        best_rank = 0
        for row in rows:
            candidate = row['dup_rank']
            if best_rank < candidate:
                best_rank = candidate
        result = dict(domain=key['domain'], phrase=key['phrase'])
        result['max_dup_rank'] = best_rank
        yield result


class PhrasesDupJoinReducer():
    """Joins the table 0 record (carrying 'max_dup_rank') into the other rows.

    Each emitted row gets 'duplicate' = 0 when its 'dup_rank' equals the
    group's 'max_dup_rank' (within 1e-9) and 1 otherwise. Groups that do not
    start with a table 0 record produce nothing.
    """

    def __call__(self, key, rows):
        row_iter = iter(rows)
        head = next(row_iter)
        if head['@table_index'] == 0:
            for row in row_iter:
                row.update(head)
                is_best = abs(row['dup_rank'] - row['max_dup_rank']) < 1e-9
                row['duplicate'] = 0 if is_best else 1
                yield row


class PhrasesFinalReducer():
    """Reducer that keeps the best phrases of a banner and serializes them.

    Phrases are ordered with non-duplicates first and higher rank first; when
    ``limit`` is set only that many are kept. The result is a single row whose
    'bl_phrases' is a comma-joined list of 'phrase:rank:duplicate' items.
    """

    def __init__(self, limit=None):
        self.limit = limit

    def __call__(self, key, rows):
        heap = []
        seen = 0
        for row in rows:
            # duplicate is reversed to make duplicate entries pop first
            entry = [1 - row['duplicate'], row['rank'], row['phrase']]
            if self.limit is not None and seen >= self.limit:
                # pop lowest element to keep only top N elements in heap
                heapq.heappushpop(heap, entry)
            else:
                heapq.heappush(heap, entry)
            seen += 1

        # drain the heap in ascending order, then flip to get the best first
        ordered = []
        while heap:
            ordered.append(heapq.heappop(heap))
        ordered.reverse()

        encoded = []
        for inverted_duplicate, rank, phrase in ordered:
            encoded.append(':'.join([phrase, str(rank), str(1 - inverted_duplicate)]))

        yield {
            'domain': key['domain'],
            'weak_cluster_key': key['weak_cluster_key'],
            'strong_cluster_key': key['strong_cluster_key'],
            'url': key['url'],
            'bl_phrases': ','.join(encoded),
        }


class RepresentativeSplitReducer():
    """Splits a group into one representative row and its phrase rows.

    Table 0 receives a single row built from the key and the title of the
    first row of the group; table 1 receives every row with a non-empty 'norm'.
    """

    def __call__(self, key, rows):
        for index, row in enumerate(rows):
            if index == 0:
                # the first row of the group supplies the representative title
                representative = dict(title=row['title'])
                representative.update(key)
                representative['@table_index'] = 0
                yield representative
            if row['norm'] != '':
                row['@table_index'] = 1
                yield row


class DomainStatMapper():
    """Mapper that pre-aggregates per-domain banner counters in memory.

    Counters are flushed whenever more than 10000 domains are buffered and
    once more in ``finish``.
    """

    def start(self):
        """Reset the in-memory counters."""
        self.stat = {}

    def finish(self):
        """Emit the counters that are still buffered."""
        for counters in self.stat.values():
            yield counters

    def __call__(self, row):
        domain = row['domain']
        counters = self.stat.get(domain)
        if counters is None:
            counters = {
                'domain': domain,
                'all_banners': 0,
                'empty_banners': 0
            }
            self.stat[domain] = counters
        counters['all_banners'] += 1
        if row['bl_phrases'] == EMPTY_PHRASE:
            counters['empty_banners'] += 1
        if len(self.stat) > 10000:
            # flush to keep the buffer bounded
            for buffered in self.stat.values():
                yield buffered
            self.stat = {}


class DomainStatReducer():
    """Reducer that sums the per-domain 'all_banners' and 'empty_banners' counters."""

    def __call__(self, key, rows):
        domain = key['domain']
        all_banners = 0
        empty_banners = 0
        for row in rows:
            all_banners += row['all_banners']
            empty_banners += row['empty_banners']
        yield {
            'domain': domain,
            'all_banners': all_banners,
            'empty_banners': empty_banners
        }


class DomainStatFilterMapper():
    """Mapper that drops rows according to per-domain filters.

    ``self.domain_filter_dict`` must be set on the instance before use; it
    maps a domain to its filters:

    * 'remove_mainpage' - drop rows whose 'url' ends with '<domain>/';
    * 'empty_banner_prob' - keep a row with empty 'bl_phrases' only with
      this probability.

    Rows of domains without filters pass through unchanged.
    """

    def __call__(self, row):
        domain = row['domain']
        if domain in self.domain_filter_dict:
            filters = self.domain_filter_dict[domain]
            # Literal suffix comparison: the domain used to be pasted unescaped
            # into a regex, so '.' matched any character and regex
            # metacharacters in a domain could raise.
            if 'remove_mainpage' in filters and row['url'].endswith(domain + '/'):
                return
            if ('empty_banner_prob' in filters
                    and row['bl_phrases'] == EMPTY_PHRASE
                    and random.uniform(0, 1) > filters['empty_banner_prob']):
                return
        yield row


class FinalTableMapper():
    """Mapper that projects a banner row onto the final table columns.

    'name' is the title followed by the phrase texts of the first three
    'bl_phrases' items (the part before the first ':' of each item).
    """

    def __call__(self, row):
        title = row['title']
        phrases = row['bl_phrases']
        leading_phrases = [item.split(':')[0] for item in phrases.split(',')[:3]]
        yield {
            'domain': row['domain'],
            'url': row['url'],
            'norm_url': row['norm_url'],
            'bl_title': title,
            'bl_title_perf': row['title_perf'],
            'bl_long_title': row['long_title'],
            'bl_phrases': phrases,
            'name': title + ' ' + ' '.join(leading_phrases),
            'flags': row['flags'],
        }


def main():
    yt_meta_spec_large = {
        'max_cpu': 2000,
        'bm_layers': 24,
    }

    yt_meta_spec_normal = {
        'max_cpu': 2000,
        'bm_layers': 4,
    }

    bmyt_cl = TODOREMOVEDURINGBMYTREFACTORING(
        process_count=1,
        yt_meta_spec=yt_meta_spec_normal,
    )
    yt = bmyt_cl.yt_client

    bmyt_cl_python_large = TODOREMOVEDURINGBMYTREFACTORING(
        process_count=1,
        yt_meta_spec=yt_meta_spec_large,
    )
    yt_python_large = bmyt_cl_python_large.yt_client

    bmyt_cl_perl_postprocess = TODOREMOVEDURINGBMYTREFACTORING(
        process_count=4,
        yt_meta_spec=yt_meta_spec_large,
    )
    yt_perl_postprocess = bmyt_cl_perl_postprocess.yt_client

    dse_params = irt.bannerland.options.get_option('DSE_Banners_params')
    source_queries = irt.broadmatching.common_options.get_options()['DynSources']['queries']['yt_raw_table']

    source_links = irt.broadmatching.common_options.get_options()['DynSources']['links']['yt_filtered_table']
    landing_titles = dse_params['landing_titles']
    kwyt_titles = dse_params['kwyt_titles']

    dse_base = dse_params['dse_base']

    dse_banners = irt.broadmatching.common_options.get_options()['DynSources']['dse']['yt_path_domain']
    norm_format = dse_params['norm_format']

    dict_stopwords = irt.broadmatching.common_options.get_options()['dict_stopwords']

    generate_dse_base = True

    try:
        base_upload_time = get_upload_time(dse_base)
        dse_banners_upload_time = get_upload_time(dse_banners)
        if base_upload_time > dse_banners_upload_time:
            generate_dse_base = False
            logging.info("dse_base is newer than dse_banners, proceed with dse_banners generation")
    except Exception as e:
        logging.exception("get_upload_time failed, proceed with dse_base generation. Exception: %s", str(e))

    if generate_dse_base:
        logging.info("dse_base generation started")
        with yt.Transaction() as shared_tx:
            with yt.TempTable() as norm_url_queries, \
                    yt.TempTable() as norm_queries, \
                    yt.TempTable() as joined_searchcount, \
                    yt.TempTable() as links_intermediate, \
                    yt.TempTable() as dse_queries, \
                    yt.TempTable() as prepared_links, \
                    yt.TempTable() as aggregated_urls:

                with yt_python_large.Transaction(transaction_id=shared_tx.transaction_id, ping=False):
                    yt_python_large.run_reduce(
                        QueriesUrlNormalizeReducer(),
                        source_queries+'{Host,Path,Query,Title,RegionId,Clicks,Shows,Position,UrlAgeSeconds,DssmCtr,DssmCtrNoMiner,DssmUta}',
                        [norm_url_queries, links_intermediate],
                        reduce_by=['Host', 'Path'],
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields")
                    )

                    strong_url_weaken(norm_url_queries, norm_url_queries, yt)

                    yt.run_map_reduce(
                        None,
                        QueriesNormReducer(),
                        norm_url_queries+'{url,strong_url,query,region_id,clicks,shows,position}',
                        norm_queries,
                        reduce_by=['strong_url', 'query'],
                        reduce_local_files=[QueriesNormReducer().files],
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                        spec={
                            "reducer": {
                                "memory_limit": 2 * 2 ** 30,
                                "tmpfs_path": ".",
                                "copy_files": True,
                            },
                        },
                    )

                    yt.run_sort(norm_queries, sort_by=['norm', 'strong_url'])
                    yt.run_reduce(
                        QueriesPreReducer(),
                        norm_queries,
                        '<sorted_by=[norm]>' + joined_searchcount,
                        reduce_by=['norm', 'strong_url'],
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                    )

                    join_hits_by_norm(joined_searchcount, yt)

                    yt.run_sort(joined_searchcount, sort_by=['strong_url'])

                    yt.run_reduce(
                        UrlsReducer(),
                        joined_searchcount,
                        '<sorted_by=[strong_url]>' + aggregated_urls,
                        reduce_by=['strong_url'],
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                    )

                    aggregate_reducer = AggregateReducer()
                    aggregate_reducer.norm_format = norm_format
                    with open(dict_stopwords, "r") as f:
                        aggregate_reducer.stop_words = {line.decode('utf-8').strip() for line in f}
                    yt.run_reduce(
                        aggregate_reducer,
                        [aggregated_urls, joined_searchcount],
                        '<sorted_by=[strong_url]>' + dse_queries,
                        reduce_by=['strong_url'],
                        spec=YT_HEAVY_ROW_SPEC,
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                    )

                with yt_python_large.Transaction(transaction_id=shared_tx.transaction_id, ping=False):
                    # raw_queries and raw_links are reduced by md5 because of key size limit and lots of repeating titles
                    # native titles are already there
                    # add links
                    yt_python_large.run_map_reduce(
                        None,
                        RawLinkPrepareReducer(),
                        source_links + '{TargetUrl,Text}',
                        '<append=true>' + links_intermediate,
                        reduce_by=['TargetUrl'],
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                    )

                    # add kwyt
                    yt.run_map(
                        KWYTPrepareMapper(),
                        kwyt_titles,
                        '<append=true>' + links_intermediate,
                    )

                    # add landing_titles

                    yt.run_map(
                        LandingPrepareMapper(),
                        landing_titles,
                        '<append=true>' + links_intermediate,
                    )

                    strong_url_weaken(links_intermediate, links_intermediate, yt)

                    yt_python_large.run_sort(
                        links_intermediate,
                        sort_by=['strong_url']
                    )
                    yt_python_large.run_reduce(
                        LinkPrepareReducer(),
                        links_intermediate,
                        '<sorted_by=[strong_url]>' + prepared_links,
                        reduce_by=['strong_url'],
                        spec=YT_HEAVY_ROW_SPEC,
                        format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                    )

                yt.run_reduce(
                    BaseJoinReducer(),
                    [prepared_links, dse_queries],
                    dse_base,
                    reduce_by=['strong_url'],
                    spec=YT_HEAVY_ROW_SPEC,
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )

                set_upload_time(dse_base, yt)
        logging.info("dse_base generation finished")

    logging.info("dse_banners generation started")
    with yt.Transaction() as shared_tx:
        with yt.TempTable() as dse_base_titles, \
                yt.TempTable() as phrases, \
                yt.TempTable() as categ_representatives, \
                yt.TempTable() as categ_representatives_processed, \
                yt.TempTable() as phrase_banners, \
                yt.TempTable() as phrase_banners_categs, \
                yt.TempTable() as max_dup_rank, \
                yt.TempTable() as phrases_dup, \
                yt.TempTable() as phrases_final, \
                yt.TempTable() as categorized_empty_banners, \
                yt.TempTable() as dse_banners_dummy_unfiltered, \
                yt.TempTable() as dse_banners_dummy_filtered, \
                yt.TempTable() as dse_banners_with_norm_url, \
                yt.TempTable() as dse_banners_with_guarantees, \
                yt.TempTable() as domain_stat, \
                yt.TempTable() as dse_banners_dyn_unsorted:

            if True:
                perl_postprocess_title_begin = """
                    use Encode qw(FB_CROAK decode);
                    use BM::BannersMaker::BannerLandProject;
                    my $proj = BM::BannersMaker::BannerLandProject->new({
                        load_dicts                              => 1,
                    });

                    $self->{proj} = $proj;
                """

                perl_postprocess_title_main = """
                    eval { decode( 'UTF-8', $r->{url}, FB_CROAK ) } or return;
                    for my $field (qw(title kwyt_title landing_title links_title)) {
                        next;
                        return if defined $r->{$field} && length($r->{$field}) > 10000;
                    }
                    $self->{proj}->dse_tools->process_dse_base_row_title_step($r, sub{ my $result = shift; yield($result => OUTPUT_TABLE) } );
                """

                perl_postprocess_title_mapper = {
                    'begin': perl_postprocess_title_begin,
                    'mapper': perl_postprocess_title_main,
                    'end': """
                        $self->{proj}->log("done")
                    """,
                    'dst_names': ['OUTPUT_TABLE'],
                    'dst_fields': [{'domain': str, 'url': str, 'title': str, 'title_perf': str, 'long_title': str, 'norm': str, 'weak_cluster_key': str, 'strong_cluster_key': str}]
                }

                with yt_perl_postprocess.Transaction(transaction_id=shared_tx.transaction_id, ping=False):
                    bmyt_cl_perl_postprocess.run_bm_map(
                        perl_postprocess_title_mapper,
                        dse_base,
                        dse_base_titles,
                        data_size_per_job=200000000,
                    )
                yt.run_sort(dse_base_titles, sort_by=['domain', 'weak_cluster_key', 'strong_cluster_key'])
                yt.run_reduce(
                    RepresentativeSplitReducer(),
                    dse_base_titles,
                    [categ_representatives, phrase_banners],
                    reduce_by=['domain', 'weak_cluster_key'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )

                perl_postprocess_categ_begin = """
                    use BM::BannersMaker::BannerLandProject;
                    my $proj = BM::BannersMaker::BannerLandProject->new({
                        load_dicts                              => 1,
                        load_minicategs_light                   => 1,
                        allow_lazy_dicts                        => 1,
                        use_comptrie_subphraser                 => 1,
                        use_sandbox_categories_suppression_dict => 1,
                    });
                    $proj->categs_tree->never_read_categs_cache(1);
                    $proj->categs_tree->never_write_categs_cache(1);

                    $self->{proj} = $proj;
                """

                perl_postprocess_categ_main = """
                    $self->{proj}->dse_tools->process_dse_base_row_categ_step($r, sub{ my $result = shift; yield($result => OUTPUT_TABLE) } );
                """

                perl_postprocess_categ_mapper = {
                    'begin': perl_postprocess_categ_begin,
                    'mapper': perl_postprocess_categ_main,
                    'dst_names': ['OUTPUT_TABLE'],
                    'dst_fields': [{'domain': str, 'weak_cluster_key': str, 'categs': str, 'flags': str}],
                }

                with yt_perl_postprocess.Transaction(transaction_id=shared_tx.transaction_id, ping=False):
                    bmyt_cl_perl_postprocess.run_bm_map(
                        perl_postprocess_categ_mapper,
                        categ_representatives,
                        categ_representatives_processed,
                    )
                yt.run_sort(categ_representatives_processed, sort_by=['domain', 'weak_cluster_key'])
                yt.run_sort(phrase_banners, sort_by=['domain', 'weak_cluster_key'])

                yt.run_reduce(
                    InnerJoinReducer(),
                    [categ_representatives_processed, phrase_banners],
                    phrase_banners_categs,
                    reduce_by=['domain', 'weak_cluster_key'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )

                perl_postprocess_phrases_begin = """
                    use BM::BannersMaker::BannerLandProject;
                    my $proj = BM::BannersMaker::BannerLandProject->new({
                        load_dicts                              => 1,
                        load_minicategs_light                   => 1,
                        allow_lazy_dicts                        => 1,
                        use_comptrie_subphraser                 => 1,
                        use_sandbox_categories_suppression_dict => 1,
                    });
                    $proj->categs_tree->never_read_categs_cache(1);
                    $proj->categs_tree->never_write_categs_cache(1);

                    $self->{proj} = $proj;
                """

                perl_postprocess_phrases_main = """
                    $self->{proj}->dse_tools->process_dse_base_row_phrases_step($r, sub{ my $result = shift; yield($result => OUTPUT_TABLE) } );
                """

                perl_postprocess_phrases_mapper = {
                    'begin': perl_postprocess_phrases_begin,
                    'mapper': perl_postprocess_phrases_main,
                    'dst_names': ['OUTPUT_TABLE'],
                    'dst_fields': [{'domain': str, 'weak_cluster_key': str, 'strong_cluster_key': str, 'url': str, 'phrase': str, 'rank': float, 'dup_rank': float}],
                }
                with yt_perl_postprocess.Transaction(transaction_id=shared_tx.transaction_id, ping=False):
                    bmyt_cl_perl_postprocess.run_bm_map(
                        perl_postprocess_phrases_mapper,
                        phrase_banners_categs,
                        phrases,
                    )

                yt.run_sort(phrases, sort_by=['domain', 'phrase'])

                yt.run_reduce(
                    PhrasesDupMaxReducer(),
                    phrases+'{domain,phrase,dup_rank}',
                    '<sorted_by=[domain;phrase]>'+max_dup_rank,
                    reduce_by=['domain', 'phrase'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )

                yt.run_reduce(
                    PhrasesDupJoinReducer(),
                    [max_dup_rank, phrases],
                    phrases_dup,
                    reduce_by=['domain', 'phrase'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )
                yt.run_sort(phrases_dup, sort_by=['domain', 'weak_cluster_key', 'strong_cluster_key', 'url'])
                yt.run_reduce(
                    PhrasesFinalReducer(limit=dse_params['max_phrases_per_url']),
                    phrases_dup,
                    '<sorted_by=[domain;weak_cluster_key;strong_cluster_key;url]>'+phrases_final,
                    reduce_by=['domain', 'weak_cluster_key', 'strong_cluster_key', 'url'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )
                yt.run_reduce(
                    InnerJoinReducer(),
                    [categ_representatives_processed, dse_base_titles+'{domain,weak_cluster_key,strong_cluster_key,url,title,title_perf,long_title}'],
                    categorized_empty_banners,
                    reduce_by=['domain', 'weak_cluster_key'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )
                yt.run_sort(categorized_empty_banners, sort_by=['domain', 'weak_cluster_key', 'strong_cluster_key', 'url'])
                yt.run_reduce(
                    EmptyPhraseLeftJoinReducer(),
                    [phrases_final, categorized_empty_banners],
                    dse_banners_dummy_unfiltered,
                    reduce_by=['domain', 'weak_cluster_key', 'strong_cluster_key', 'url'],
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )
                yt.run_map_reduce(
                    DomainStatMapper(),
                    DomainStatReducer(),
                    dse_banners_dummy_unfiltered+'{domain,bl_phrases}',
                    domain_stat,
                    reduce_by=['domain'],
                    reduce_combiner=DomainStatReducer(),
                    format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
                )
            domain_filter_dict = {}
            for row in yt.read_table(domain_stat):
                row_processed = {}
                if row['all_banners'] > 10 or row['domain'] == 'yandex.ru':
                    row_processed['remove_mainpage'] = 1
                if row['empty_banners'] > 10000000:
                    row_processed['empty_banner_prob'] = 10000000.0 / row['empty_banners']
                if len(row_processed.keys()):
                    domain_filter_dict[row['domain']] = row_processed
            domain_stat_filter_mapper = DomainStatFilterMapper()
            domain_stat_filter_mapper.domain_filter_dict = domain_filter_dict
            yt.run_map(
                domain_stat_filter_mapper,
                dse_banners_dummy_unfiltered,
                dse_banners_dummy_filtered,
            )

            with yt_perl_postprocess.Transaction(transaction_id=shared_tx.transaction_id, ping=False):
                add_norm_url_to_table(
                    dse_banners_dummy_filtered,
                    dse_banners_with_norm_url,
                    bmyt_client=bmyt_cl_perl_postprocess
                )

            enforce_dyntable_guarantees(
                dse_banners_with_norm_url,
                dse_banners_with_guarantees,
                key=['norm_url'],
                max_row_size=YT_FINAL_MAX_ROW_SIZE,
                yt_client=yt
            )

            final_schema = '<schema=[' + ';'.join(map(lambda x: '{name=' + x + '; type=string' + '}',
                                                      ['domain', 'url', 'norm_url', 'bl_title', 'bl_title_perf', 'bl_long_title',
                                                       'bl_phrases', 'name', 'flags'])) + ']>'
            yt.run_map(
                FinalTableMapper(),
                dse_banners_with_guarantees,
                [final_schema + dse_banners_dyn_unsorted],
                format=yt_wrapper.YsonFormat(control_attributes_mode="row_fields"),
            )
            yt.run_sort(final_schema + dse_banners_dyn_unsorted, final_schema + dse_banners, sort_by=['domain'])
            set_upload_time(dse_banners, yt)

    logging.info("dse_banners generation finished")

    logging.info("dse domains export started")

    def get_domain(row):
        """Map a banner row to a ``{'Domain': <netloc>}`` record.

        Reads ``row['url']`` and takes the network-location part of the
        parsed URL.  Rows whose URL has an empty network location (for
        example a URL lacking the ``//`` authority part) produce no output.
        """
        netloc = urlparse(row['url']).netloc
        if not netloc:
            # Nothing to export for this row.
            return
        yield {'Domain': netloc}

    yt.run_map_reduce(
        get_domain,
        FirstReducer(),
        yt.TablePath(dse_banners, columns=['url']),
        EXPORT_DOMAINS_TABLE,
        reduce_by=['Domain']
    )

    logging.info("dse domains export finished")


if __name__ == '__main__':
    # A file lock keeps concurrent runs of this script from overlapping.
    # timeout=0 makes acquisition fail immediately with filelock.Timeout
    # instead of waiting when the lock is already held.
    single_instance_lock = filelock.FileLock('generate_dse_banners')
    try:
        with single_instance_lock.acquire(timeout=0):
            main()
    except filelock.Timeout:
        logging.warning('Another instance of this application currently holds the lock.')
