# coding=utf-8
import argparse
import base64
import codecs
import hashlib
import json
import os
import time
from collections import defaultdict

from django.core.management import setup_environ
import yt.wrapper as yt
from yql.api.v1.client import YqlClient

import settings

setup_environ(settings)

from main.models import *


class JsonResportBulder(object):
    """Build a hierarchical JSON report of "generalized features".

    A feature name such as ``u'A: B: C'`` is expanded into its prefixes
    (``A``, ``A: B``, ``A: B: C``). Every prefix becomes a node of the report
    tree carrying frequencies, coverage percentages and a few examples.

    Data is read from the Django models ``SearchEngine``, ``ElementsGroup``
    and ``Example`` and from a YT table with per-feature frequencies, which is
    built with a YQL query when it does not exist yet.
    """

    # Upper bound on the number of examples stored per report node.
    MAX_EXAMPLES_TO_SHOW = 5

    # Name of the synthetic node that is the parent of all first-level features.
    ROOT_NAME = '__root__'

    # Frequencies of the root node; also the denominators of the coverage
    # percentages. Presumably the totals reported by this YQL operation
    # (TODO confirm):
    # https://yql.yandex-team.ru/Operations/XXQJJFPzVAfkGI1uc9aocVuAghD8fwx71lXluqU8KYw=
    ROOT_FREQ = {
        'freq_yandex': 109986,
        'freq_beast': 480468
    }

    # Alias -> canonical spelling. Applied to the whole feature name first and
    # then to each ':'-separated part of it.
    NORMALIZED_NAMES = {
        u'ОО': u'Объектный ответ',
        u'Фактовый': u'Фактовый ответ',
        u'фактовый ответ': u'Фактовый ответ',
        u'спецсниппет': u'Спецсниппет',
        u'Сниппет с характеристиками продукта': u'Сниппет с оценкой',
        u'Голосование в ОО': u'ОО: голосование',
        u'Карта': u'гео: многоорг',
        u'Колдунщик: место-карта': u'гео: многоорг',
        u'Колдунщик: отели': u'гео: отели',
        u'факт из ОО': u'ОО: фактовый запрос',
        u'реклама': u'Реклама',
        u'Реклама с сайтлинками': u'Реклама: с сайтлинками',
        u'реклама google play': u'Реклама: google play',
        u'реклама с адресом': u'Реклама: с адресом',
        u'Реклама со вставкой': u'Реклама: со вставкой',
        u'Сниппет с картинками': u'Спецсниппет: галерея',
        u'фактовый ответ с похожими запросами': u'ОО: с похожими запросами',
        u'ОО с иллюстрацией': u'ОО: с иллюстрацией',
        u'Сниппет с фото': u'ОО: с иллюстрацией',
        u'ОО со связанными запросами': u'ОО: со связанными запросами',
        u'колдунщик погоды': u'Колдунщик погоды'
    }

    def __init__(self, run_name, raw_skel_table, yandex_query_freq_table, beast_query_freq_table, yt_prefix):
        """Store the report configuration.

        :param run_name: name used to look up the ``SearchEngine`` record.
        :param raw_skel_table: stored as-is; not read by any method of this class.
        :param yandex_query_freq_table: stored as-is; not read by any method of this class.
        :param beast_query_freq_table: stored as-is; not read by any method of this class.
        :param yt_prefix: YT path prefix for the intermediate tables written by this class.
        """
        self.run_name = run_name
        self.raw_skel_table = raw_skel_table
        self.yandex_query_freq_table = yandex_query_freq_table
        self.beast_query_freq_table = beast_query_freq_table
        self.yt_prefix = yt_prefix

    @staticmethod
    def _log(*parts):
        """Print a progress line: the current time followed by space-separated parts."""
        # A single-argument print(...) call behaves the same under Python 2 and 3.
        print(' '.join([time.ctime()] + [str(part) for part in parts]))

    def get_name_prefixes(self, name):
        """Return every normalized prefix of a ':'-separated feature name.

        The whole name and then each of its parts are stripped and mapped
        through ``NORMALIZED_NAMES``; the parts are re-joined with ``u': '``.

        :param name: raw feature name, e.g. ``u'a:b:c'``.
        :return: list of prefixes from the shortest to the full name,
            e.g. ``[u'a', u'a: b', u'a: b: c']``. An empty name yields ``[u'']``.
        """
        name = name.strip()
        name = self.NORMALIZED_NAMES.get(name, name)
        name_parts = []
        for name_part in name.strip().split(':'):
            name_part = name_part.strip()
            name_parts.append(self.NORMALIZED_NAMES.get(name_part, name_part))
        return [u': '.join(name_parts[:i + 1]) for i in range(len(name_parts))]

    def eval_skeleton_md5(self, skeleton):
        """Return the base64-encoded MD5 digest of a skeleton string.

        Unicode input is encoded as UTF-8 before hashing.
        """
        # same as SerpParser.py SerpParser.eval_skeleton_md5
        if isinstance(skeleton, unicode):
            skeleton = skeleton.encode('utf8')
        assert isinstance(skeleton, str)
        return base64.b64encode(hashlib.md5(skeleton).digest())

    def eval_gen_feature_freq(self):
        """Return a mapping ``gen_feature -> {'freq_yandex': ..., 'freq_beast': ...}``.

        Lookup order:
        1. the local JSON cache file ``gen_feature2freq.json`` if it exists;
        2. otherwise the YT frequency table, which is first built by a YQL
           query when it does not exist yet.
        The result read from YT is saved to the local JSON cache.

        :raises RuntimeError: when the YQL operation does not succeed.
        """
        gen_feature_freq_json_fname = 'gen_feature2freq.json'
        if os.path.exists(gen_feature_freq_json_fname):
            self._log('get gen_feature_freq_table from cache', gen_feature_freq_json_fname)
            with open(gen_feature_freq_json_fname) as f:
                return json.load(f)

        gen_feature_freq_table = '//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample__gen_feature_freq'
        if not yt.exists(gen_feature_freq_table):
            self._log('build gen_feature_freq_table', gen_feature_freq_table)
            gen_feature_skel = []
            search_engine = SearchEngine.objects.get(name=self.run_name)
            for eg in ElementsGroup.objects.filter(search=search_engine):
                if not eg.feature:
                    # Groups without a feature are not part of the report.
                    continue

                for gen_feature in self.get_name_prefixes(eg.feature.name):
                    gen_feature_skel.append({
                        'gen_feature': gen_feature,
                        'level': len(gen_feature.split(':')),
                        'feature': eg.feature.name,
                        'skeleton_md5': self.eval_skeleton_md5(eg.skeleton),
                    })
            gen_feature_skel_table = self.yt_prefix + '_gen_feature_skel'
            self._log('write', gen_feature_skel_table)
            yt.write_table(gen_feature_skel_table, gen_feature_skel)

            # NOTE(review): the table paths inside the query are hard-coded, so
            # the query reads the table written above only when yt_prefix
            # matches those paths.
            # https://yql.yandex-team.ru/Operations/XXQC0Z9LnnqVzu5h21elhNuXzqT2WTkz_L0X3ZgQ1v8=
            yql = """
                use arnold;
                pragma yt.UseColumnarStatistics="0";
                pragma yt.ForceInferSchema='10';

                $gen_feature2query =
                select gen_feature, query
                  from `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample__gen_feature_skel` as fs
                    join `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample_skel` as skel on fs.skeleton_md5=skel.skeleton_md5
                  group by fs.gen_feature as gen_feature, skel.query_text as query
                ;

                insert into `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample__gen_feature_freq` with truncate
                select gen_feature,
                    sum(qfreq_yandex.freq ?? 0) as freq_yandex,
                    sum(if(qfreq_beast.key is not null, 1,0)) as freq_beast
                  from $gen_feature2query as q
                    left join `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/query_sample_201905-07_100k` as qfreq_yandex on q.query=qfreq_yandex.query
                    left join `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190821_html` as qfreq_beast on q.query=qfreq_beast.key
                  group by q.gen_feature as gen_feature
                ;

                select sum(freq) as cnt_queries
                  from `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/query_sample_201905-07_100k`
                  into result `count queries yandex`
                ;

                select count(1) as cnt_queries
                  from `//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190821_html`
                  into result `count queries beast`
                ;
            """
            yql_client = YqlClient(token_path=os.path.expanduser('~/.yql/token'))
            yql_operation = yql_client.query(yql, syntax_version=1)
            print('Running YQL operation')
            yql_operation.run()
            print('\tShared url: {}'.format(yql_operation.share_url))
            yql_operation.wait_progress()
            if yql_operation.is_success:
                print('\tCompleted')
            else:
                raise RuntimeError('YQL operation failed: {}\n{}'.format(
                    yql_operation.share_url,
                    '\n'.join(map(str, yql_operation.errors)),
                ))

        gen_feature2freq = dict()
        self._log('read gen_feature_freq_table', gen_feature_freq_table)
        for rec in yt.read_table(gen_feature_freq_table):
            gen_feature2freq[rec['gen_feature']] = dict(
                freq_yandex=rec['freq_yandex'],
                freq_beast=rec['freq_beast'],
            )

        self._log('save gen_feature_freq_table to cache', gen_feature_freq_json_fname)
        with open(gen_feature_freq_json_fname, 'w') as fo:
            json.dump(gen_feature2freq, fo, indent=4)

        return gen_feature2freq

    def build_json_report(self):
        """Build the report tree and return it as ``{'features': <root node>}``.

        Every node is a dict with the keys ``id``, ``name``, ``level``,
        ``freq_yandex``, ``freq_beast``, ``coverage_yandex``,
        ``coverage_beast``, ``examples``, ``subfeatures_count`` and
        ``subfeatures`` (child nodes, sorted by ``freq_yandex`` descending).

        :raises KeyError: when a generalized feature has no entry in the
            frequency mapping returned by ``eval_gen_feature_freq``.
        """
        gen_feature2freq = self.eval_gen_feature_freq()
        gen_feature2freq[self.ROOT_NAME] = self.ROOT_FREQ

        gen_feature2subfeats = defaultdict(set)
        gen_feature2examples = defaultdict(list)
        search_engine = SearchEngine.objects.get(name=self.run_name)
        self._log('fetch all element groups, combine them to gen_feature')
        for eg in ElementsGroup.objects.filter(search=search_engine):
            if not eg.feature:
                continue

            # The examples do not depend on the prefix, so query them once per
            # group rather than once per prefix.
            group_examples = [
                dict(
                    query_text=example.query_text,
                    element_id=example.element_id,
                    serp_url_static=example.serp_url_static,
                    screenshot_url_static=example.screenshot_url_static
                )
                for example in Example.objects.filter(elements_group=eg)
            ]

            feature_name_level = len(eg.feature.name.split(':'))
            parent_feature = self.ROOT_NAME
            for gen_feature_level, gen_feature in enumerate(self.get_name_prefixes(eg.feature.name)):
                gen_feature2subfeats[parent_feature].add(gen_feature)
                parent_feature = gen_feature

                # gen_feature_level is zero-based. Features whose raw name has
                # more than two parts contribute examples only from their
                # third prefix onwards.
                if feature_name_level <= 2 or gen_feature_level >= 2:
                    # Each node gets its own copies of the example dicts.
                    gen_feature2examples[gen_feature].extend(
                        dict(example_json) for example_json in group_examples)

        gen_feature_name_set = set()

        def gen_feature2json(gen_feature):
            """Recursively convert a generalized feature into a report node."""
            # Ids are assigned in visiting order, parent before children.
            feature_id = len(gen_feature_name_set)
            gen_feature_name_set.add(gen_feature)
            # Examples with query text of at most 100 characters come first,
            # then ordered by element_id.
            examples = sorted(gen_feature2examples[gen_feature],
                              key=lambda f: (len(f['query_text']) > 100, f['element_id']))
            examples = examples[:self.MAX_EXAMPLES_TO_SHOW]
            subfeatures = [gen_feature2json(f) for f in gen_feature2subfeats[gen_feature]]
            subfeatures.sort(key=lambda f: f['freq_yandex'], reverse=True)
            freq = gen_feature2freq[gen_feature]
            return dict(
                id=feature_id,
                name=gen_feature,
                level=0 if gen_feature == self.ROOT_NAME else len(gen_feature.split(':')),
                freq_yandex=freq['freq_yandex'],
                freq_beast=freq['freq_beast'],
                coverage_yandex=100. * freq['freq_yandex'] / self.ROOT_FREQ['freq_yandex'],
                coverage_beast=100. * freq['freq_beast'] / self.ROOT_FREQ['freq_beast'],
                examples=examples,
                subfeatures_count=len(subfeatures),
                subfeatures=subfeatures,
            )

        self._log('make report')
        report = {}
        report['features'] = gen_feature2json(self.ROOT_NAME)
        self._log('done', len(gen_feature_name_set), 'tags')
        return report


if __name__ == '__main__':
    # Script entry point: build the report for one fixed run and dump it as
    # pretty-printed UTF-8 JSON.

    # Table location and column names of the Yandex query-frequency sample.
    yandex_freq_source = {
        'table_name': '//home/shinyserp/irlab/SNIPPETS-7735_anatomy/query_sample_201905-07_100k',
        'query_field': 'query',
        'freq_field': 'freq',
    }
    # Table location and column names of the beast sample (no frequency column).
    beast_freq_source = {
        'table_name': '//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190821_html',
        'query_field': 'key',
        'freq_field': None,
    }

    builder = JsonResportBulder(
        run_name='GoogleRu_20190827_mixed_sample',
        raw_skel_table='//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample_raw_skel',
        yandex_query_freq_table=yandex_freq_source,
        beast_query_freq_table=beast_freq_source,
        yt_prefix='//home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample_',
    )
    report = builder.build_json_report()

    with codecs.open('../media/anatomy_report_201909.json', 'w', encoding='utf8') as report_file:
        json.dump(report, report_file, indent=4, ensure_ascii=False, sort_keys=True)

    # print json.dumps(json_report, indent=4, ensure_ascii=False)

# input:
#    competitors.db
#       tag
#       skeleton
#
#    //home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190827_mixed_sample_skel
#       180G data
#       skeleton_md5
#       query_text
#
#    //home/shinyserp/irlab/SNIPPETS-7735_anatomy/query_sample_201905-07_100k
#       sample from the Yandex query stream
#       query
#       freq
#
#    //home/shinyserp/irlab/SNIPPETS-7735_anatomy/20190829/20190821_html
#       query sample for beast (similarweb)
#       key = query
#
#
# output: table like
#    https://wiki.yandex-team.ru/users/irlab/Anatomija-zarubezhnyx-konkurentov/testresulttable2/
#    top-level feature
#    example queries
#    example screenshots (a link is fine)
#    coverage over the Yandex query stream
#    coverage over beast (similarweb)
#    number of queries that made it into the labeling
#    a field for comments
