# -*- coding: utf-8 -*-


import nile
import datetime
import json
import imp
import math
import uatraits
import re

from qb2.api.v1.typing import *
from functools import partial
from collections import defaultdict
from urlparse import urlparse, parse_qsl
from urllib import unquote

from nile.api.v1 import (
    clusters,
    Record,
    aggregators as na,
    extractors as ne,
    filters as nf,
    cli,
    with_hints,
)

# Default value of the `out_path` CLI option; the job writes one table per
# processed date under this directory.
OUTPUT_PATH = '//home/images/analytics/images_commercial_cube'
# Directory with the per-date input tables read by the job.
# NOTE(review): the value ends with '/', so joining it with '/<date>' yields a
# doubled separator - confirm the path consumer tolerates that.
INPUT_PATH = '//home/images/analytics/images_cubes/'

# Output schema of the mapper: one row per request, passed to `with_hints`
# as `output_schema`. Type names come from the qb2 typing star-import.
SCHEMA = {
    'uid': String,  # yandexuid
    'reqid': String,  # request id
    'serpid': String,  # serp id
    'query': String,  # user query
    'ui': String,  # desktop|mobile|touch|app|pad|other
    'region': Integer,  # region in number format
    'fielddate': String,  # request date
    'interior': Float,  # IMAGES.ImgQueryFactors.QInteriorProbV1
    'qcpv3': Float,  # IMAGES.ImgQueryFactors.QCommercialProbV3
    'prism_cluster': String,  # UPPER.PrismBigBLog.prism_cluster search prop ('' when absent)
    'prism_segment': String,  # UPPER.PrismBigBLog.prism_segment search prop ('' when absent)
    # Show / click counters, one struct per (placement, datasource) pair.
    'shows': List[Struct["placement": String, "datasource": String, "shows": Integer]],
    'clicks': List[Struct["placement": String, "datasource": String, "clicks": Integer]],
    # Shown / clicked urls, one struct per (placement, datasource) pair.
    'shown_urls': List[Struct["placement": String, "datasource": String, "urls": List[String]]],
    'clicked_urls': List[Struct["placement": String, "datasource": String, "urls": List[String]]],
}

# Compiled regular expressions applied to a counter's converted path.
# The pattern is assembled from adjacent raw literals (one alternative group
# per line); the resulting pattern string is a single regex.
ACTION_PATTERNS = {
    "PATTERN_GREENURL": re.compile(
        r"^/image/.*/("
        # single-token targets
        r"site|url|title|link|button|snippet|collections"
        # commercial/... targets, optionally under incut/
        r"|commercial/(incut/)?(similar|description|related|text|contacts|sitelink|((incut|behavioral)/)?thumb)"
        # .../click targets
        r"|(market_offers|direct|polaroid/market)/click"
        # duplicates/<element>/<source> targets
        r"|(duplicates/(rating|price|model-rating|button|title|url)/(market|schemaorg))"
        r")"
    ),
}

# Path fragments that disqualify a pattern-matched counter from being a
# greenurl click. Matching is by substring, so "show" already covers "shown".
GREENURL_EXCEPTIONS = [
    "show",
    "scroll",
    "close",
    "more",
    "expand",
    "collapse",
    "shown",
    "other/view",
    "sites/load",
    "cbir/similar/button",
    "sites/item/thumb",
    "url/search",
    "panel/drag",
]

# Path fragments for which a path mentioning "market" is NOT attributed to a
# market datasource.
# NOTE(review): "desctiption/commercial" looks like a misspelling of
# "description/commercial" - confirm against the real counter paths before
# changing it, since the literal is matched at runtime.
MARKET_EXCEPTIONS = [
    "url/commercial",
    "site/commercial",
    "desctiption/commercial",
]


def get_market_type_from_url(url):
    """Derive the market datasource from the url's host.

    Returns 'market-cpa' when the network location of ``url`` contains
    'market.yandex' and 'market-cpc' otherwise. Intended only for urls of
    the market_offers incut.
    """
    host = urlparse(url).netloc
    if 'market.yandex' in host:
        return 'market-cpa'
    return 'market-cpc'


def get_datasource_from_url(url):
    """Derive the datasource from the url's host.

    The host is checked against known markers in order; the first marker
    found decides the result, and 'schemaorg' is returned when none is
    present. Intended only for other_offers and similar_offers urls.
    """
    host = urlparse(url).netloc
    host_markers = (
        ('market-click2.yandex.', 'market-cpc'),
        ('market.yandex', 'market-cpa'),
    )
    for marker, datasource in host_markers:
        if marker in host:
            return datasource
    return 'schemaorg'


def get_direct_parameters(path):
    """Return the ``(datasource, placement)`` pair for a direct counter.

    The datasource is always 'direct'; the placement is 'preview' when the
    path mentions 'preview' and 'serp' otherwise.
    """
    if 'preview' in path:
        return 'direct', 'preview'
    return 'direct', 'serp'


def is_greenurl_exception(path):
    """Return True when ``path`` contains any GREENURL_EXCEPTIONS fragment."""
    return any(fragment in path for fragment in GREENURL_EXCEPTIONS)


def is_market_exception(path):
    """Return True when ``path`` contains any MARKET_EXCEPTIONS fragment."""
    return any(fragment in path for fragment in MARKET_EXCEPTIONS)


def is_greenurl(event):
    """Tell whether a click event is a greenurl click.

    An explicit '-action-type' variable on the event is authoritative: the
    result is True only when it equals 'greenurl'. Without it, the event's
    converted path must match PATTERN_GREENURL and must not contain any of
    the greenurl exception fragments.

    Always returns a bool (never None), including when the path does not
    match the pattern.
    """
    path = event.convertedPath
    event_vars = event.vars
    if '-action-type' in event_vars:
        return event_vars['-action-type'] == 'greenurl'

    if ACTION_PATTERNS["PATTERN_GREENURL"].search(path) is None:
        return False
    return not is_greenurl_exception(path)


class DataHolder(object):
    """Per-request container: request attributes plus show/click accumulators.

    The four accumulators are two-level mappings keyed first by placement and
    then by datasource key; counters default to 0 and url lists default to [].
    """

    def __init__(self, record):
        """Copy request-level fields out of ``record`` and create empty accumulators.

        ``record`` must support item access for the keys read below, and its
        'SearchPropsValues' value must provide ``.get(key, default)``.
        """
        # Request-level attributes.
        self.uid = record['uid']
        self.reqid = record['ReqID']
        self.serpid = record['serpid']
        self.query = record['query']
        self.region = record['UserRegion']
        self.ui = record['ui']
        self.fielddate = record['fielddate']

        # Search props: missing factors fall back to 0.0, missing prism values to ''.
        search_props = record['SearchPropsValues']
        self.interior = float(search_props.get('IMAGES.ImgQueryFactors.QInteriorProbV1', '0'))
        self.qcpv3 = float(search_props.get('IMAGES.ImgQueryFactors.QCommercialProbV3', '0'))
        self.prism_cluster = search_props.get('UPPER.PrismBigBLog.prism_cluster', '')
        self.prism_segment = search_props.get('UPPER.PrismBigBLog.prism_segment', '')

        # placement -> key -> count
        self.showsDict = defaultdict(partial(defaultdict, int))
        self.clicksDict = defaultdict(partial(defaultdict, int))

        # placement -> key -> list of urls
        self.shownUrls = defaultdict(partial(defaultdict, list))
        self.clickedUrls = defaultdict(partial(defaultdict, list))

    def update_shows(self, placement, key, increment):
        """Add ``increment`` to the show counter of (placement, key)."""
        per_key = self.showsDict[placement]
        per_key[key] += increment

    def update_clicks(self, placement, key, increment):
        """Add ``increment`` to the click counter of (placement, key)."""
        per_key = self.clicksDict[placement]
        per_key[key] += increment

    def update_shown_urls(self, placement, key, url):
        """Append ``url`` to the shown urls of (placement, key)."""
        per_key = self.shownUrls[placement]
        per_key[key].append(url)

    def update_clicked_urls(self, placement, key, url):
        """Append ``url`` to the clicked urls of (placement, key)."""
        per_key = self.clickedUrls[placement]
        per_key[key].append(url)


class RequestCommercialData(object):
    """Collect commercial shows and clicks for one request record.

    Shows are taken from the record's ``blocks`` and from show events;
    clicks are taken from the remaining events. Everything is accumulated in
    ``self.reqdata`` and, after ``collect_data()``, flattened into the
    ``shows`` / ``clicks`` / ``shown_urls`` / ``clicked_urls`` lists of
    ``{"placement", "datasource", <value>}`` dicts.
    """

    # The lookup tables below are constant, so they live on the class instead
    # of being rebuilt for every record. They are still reachable as
    # ``self.<NAME>`` and must not be mutated.

    # Placement -> raw source tokens (inverse of PLACEMENTS_REVERSED).
    # Not referenced by the methods of this class.
    PLACEMENTS = {'description': {'thumb-snippet'},
                  'polaroid': {'polaroid', 'polaroid-commercial-snippet'},
                  'preview': {'commercial-snippet', 'snippet'},
                  'incut': {'incut'},
                  'similar_offers': {'similar_offers'},
                  'other_offers': {'other_offers'},
                  'crop': {'crop'},
                  'serp': {'serp'}
                  }
    # Value of the '-click-source' event variable -> placement.
    PLACEMENTS_REVERSED = {'thumb-snippet': 'description',
                           'polaroid': 'polaroid',
                           'polaroid-commercial-snippet': 'polaroid',
                           'snippet': 'preview',
                           'commercial-snippet': 'preview',
                           'incut': 'incut',
                           'similar_offers': 'similar_offers',
                           'other_offers': 'other_offers',
                           'crop': 'crop',
                           'serp': 'serp'
                           }
    # Raw datasource name ('-data-source' variable, block.descriptionDataSource)
    # -> datasource name used in the output.
    DATASOURCES = {'organic': 'organic',
                   'turbo': 'turbo',
                   'market': 'market-cpc',
                   'market-cpa': 'market-cpa',
                   'schemaorg': 'schemaorg'}

    # Numeric-string code -> datasource name. Just in case, currently not used.
    DATASOURCES_REVERSED = {'2': 'market-cpc',
                            '256': 'schemaorg',
                            '4096': 'turbo',
                            '16384': 'market-cpa',
                            '8192': 'market-cpc',
                            '65536': 'market-cpa'}
    # Not referenced by the methods of this class.
    TYPE = {'show', 'click'}
    # Show counters whose path contains one of these fragments are ignored.
    SHOW_BLACKLIST = ['tags', 'scroll']

    # Path fragment -> placement, checked in order when '-click-source' is
    # missing or unknown.
    _PATH_PLACEMENTS = (
        ('polaroid', 'polaroid'),
        ('commercial/crop', 'crop'),
        ('other_offers', 'other_offers'),
        ('similar_offers', 'similar_offers'),
        ('snippet/description', 'description'),
    )
    # A path mentioning 'preview' counts as the preview placement only when it
    # also contains one of these fragments.
    _PREVIEW_PATH_TOKENS = ('market', 'schemaorg', 'turbo', 'direct', 'duplicates/url', 'preview/site')

    def __init__(self, record):
        """Wrap ``record``; nothing is parsed until ``collect_data()`` is called.

        ``record`` must satisfy ``DataHolder`` and provide ``.get(key, default)``.
        """
        self.reqdata = DataHolder(record)
        # A field that is present but stored as None is treated as empty, so
        # iterating it later does not fail.
        self.blocks = record.get('blocks', []) or []
        self.events = record.get('events', []) or []

        self.clicks = []
        self.shows = []
        self.clicked_urls = []
        self.shown_urls = []

    def _record_show(self, placement, datasource, url):
        """Register a single show of ``url`` for (placement, datasource)."""
        self.reqdata.update_shown_urls(placement, datasource, url)
        self.reqdata.update_shows(placement, datasource, 1)

    def get_placement(self, event):
        """Return the placement of ``event``, or 'unknown'.

        A known '-click-source' variable wins; otherwise the placement is
        guessed from fragments of the converted path.
        """
        event_vars = event.vars
        if '-click-source' in event_vars and event_vars['-click-source'] in self.PLACEMENTS_REVERSED:
            return self.PLACEMENTS_REVERSED[event_vars['-click-source']]

        path = event.convertedPath
        for fragment, placement in self._PATH_PLACEMENTS:
            if fragment in path:
                return placement
        if 'preview' in path and any(token in path for token in self._PREVIEW_PATH_TOKENS):
            return 'preview'
        return 'unknown'

    def get_datasource(self, event):
        """Return the datasource of ``event``, or 'unknown'.

        A known '-data-source' variable wins; otherwise the datasource is
        guessed from fragments of the converted path.
        """
        event_vars = event.vars
        if '-data-source' in event_vars and event_vars['-data-source'] in self.DATASOURCES:
            return self.DATASOURCES[event_vars['-data-source']]

        path = event.convertedPath
        # 'market-cpa' contains 'market', so one exception check covers both.
        if 'market' in path and not is_market_exception(path):
            return 'market-cpa' if 'market-cpa' in path else 'market-cpc'
        if 'direct' in path:
            return 'direct'
        if 'schemaorg' in path:
            return 'schemaorg'
        if 'turbo' in path:
            return 'turbo'
        if 'duplicates/url' in path or 'preview/site' in path:
            return 'organic'
        return 'unknown'

    def parse_market_offers(self, block):
        """Parsing market offers shows with urls from blockstat counters. Datasource is defined by url.

        ``block.market_offers_incut_info`` is split on tabs; for every item the
        url is whatever follows the first '='.
        NOTE(review): an empty info string still yields one item with an empty
        url - confirm whether that show should be counted.
        """
        for item in block.market_offers_incut_info.split('\t'):
            url = item.split('=', 1)[-1]
            self._record_show('incut', get_market_type_from_url(url), url)

    def parse_show_counter(self, event):
        """Parsing client show counters.

        Blacklisted paths are skipped. Depending on which variable the event
        carries, one show is recorded per '-url', per entry of '-urls', per
        entry of '-products', or a single show with an empty url.
        """
        path = event.convertedPath
        if any(token in path for token in self.SHOW_BLACKLIST):
            return

        placement = self.get_placement(event)
        datasource = self.get_datasource(event)
        if 'direct' in path:
            datasource, placement = get_direct_parameters(path)

        event_vars = event.vars
        if '-url' in event_vars:
            self._record_show(placement, datasource, unquote(event_vars['-url']))
        elif '-urls' in event_vars:
            # Entries are ';'-separated; the url follows the first ':' and
            # decides the datasource of that entry.
            for entry in event_vars['-urls'].split(';'):
                shown_url = unquote(entry.split(':', 1)[-1])
                self._record_show(placement, get_datasource_from_url(shown_url), shown_url)
        elif '-products' in event_vars:
            # Entries are ';'-separated, each expected to have six ':'-separated
            # fields: field 1 is used as the datasource as-is, the last is the url.
            for entry in event_vars['-products'].split(';'):
                fields = entry.split(':', 5)
                if len(fields) == 6:
                    entry_source = fields[1]
                    shown_url = unquote(fields[-1])
                else:
                    # Malformed entry: fall back to the event-level datasource
                    # rather than to the datasource of the previous entry.
                    entry_source = datasource
                    shown_url = ''
                self._record_show(placement, entry_source, shown_url)
        else:
            self._record_show(placement, datasource, '')

    def parse_shows_from_blocks(self, blocks):
        """Record shows found in ``blocks``.

        The three checks are independent, so one block may produce several
        shows. An unmapped ``descriptionDataSource`` raises KeyError.
        """
        for block in blocks:
            if 'serp/results/snippet/description' in block.bsPath:
                self._record_show('description',
                                  self.DATASOURCES[block.descriptionDataSource],
                                  unquote(block.descriptionUrl))

            if 'market' in block.pos:
                self.parse_market_offers(block)

            if block.isShown:
                self._record_show('preview', 'organic', unquote(block.htmlUrl))

    def parse_click_counter(self, event):
        """Record a click for ``event`` when it is a direct or greenurl click.

        Any other event is ignored.
        """
        path = event.convertedPath
        if 'direct' in path and ('thumb' in path or 'click' in path):
            datasource, placement = get_direct_parameters(path)
            url = ''  # Currently no valid way to get url, fix after frontend changes
        elif is_greenurl(event):
            placement = self.get_placement(event)
            if '-decoded-url' in event.vars:
                url = unquote(event.vars['-decoded-url'])
            elif hasattr(event, 'url'):
                url = unquote(event.url)
            else:
                url = ''

            datasource = self.get_datasource(event)
            if placement == 'incut':
                # For the incut the clicked url decides the datasource.
                datasource = get_market_type_from_url(url)
        else:
            return

        self.reqdata.update_clicks(placement, datasource, 1)
        self.reqdata.update_clicked_urls(placement, datasource, url)

    def extract_clicks_shows_info(self, blocks, events):
        """Fill ``self.reqdata`` from ``blocks`` and ``events``."""
        # Getting shows on serp from blockstat-log
        self.parse_shows_from_blocks(blocks)

        # Getting shows in preview and clicks info from redir-log
        for event in events:
            # Any path mentioning 'show' is a show counter; the rest are
            # treated as potential clicks.
            if 'show' in event.convertedPath:
                self.parse_show_counter(event)
            else:
                self.parse_click_counter(event)

    @staticmethod
    def _flatten(nested, value_key):
        """Turn placement -> datasource -> value into a list of flat dicts."""
        flat = []
        for placement, details in nested.items():
            for source, value in details.items():
                flat.append({"placement": placement,
                             "datasource": source,
                             value_key: value})
        return flat

    def assemble_structs(self):
        """Append the accumulated data to the four output lists."""
        self.clicks.extend(self._flatten(self.reqdata.clicksDict, "clicks"))
        self.shows.extend(self._flatten(self.reqdata.showsDict, "shows"))
        self.shown_urls.extend(self._flatten(self.reqdata.shownUrls, "urls"))
        self.clicked_urls.extend(self._flatten(self.reqdata.clickedUrls, "urls"))

    def collect_data(self):
        """Parse the record's blocks and events and build the output lists."""
        self.extract_clicks_shows_info(self.blocks, self.events)
        self.assemble_structs()


@with_hints(output_schema=dict(SCHEMA))
def parse_commercial(recs, main_query):
    """Mapper: emit one commercial-cube row per qualifying request record.

    A record is skipped when it is None, when its 'service' is not 'images',
    or when its QCommercialProbV3 search prop (default '0') is not above
    0.52. Every remaining record is parsed with RequestCommercialData and
    written to ``main_query`` as a Record with the SCHEMA fields.
    """
    for rec in recs:
        if rec is None or rec['service'] != 'images':
            continue

        commercial_prob = float(
            rec['SearchPropsValues'].get('IMAGES.ImgQueryFactors.QCommercialProbV3', '0'))
        if commercial_prob <= 0.52:
            continue

        parser = RequestCommercialData(rec)
        parser.collect_data()
        holder = parser.reqdata

        main_query(Record(
            # request attributes
            uid=holder.uid,
            reqid=holder.reqid,
            serpid=holder.serpid,
            query=holder.query,
            ui=holder.ui,
            region=holder.region,
            fielddate=holder.fielddate,
            # search props
            interior=holder.interior,
            qcpv3=holder.qcpv3,
            prism_cluster=holder.prism_cluster,
            prism_segment=holder.prism_segment,
            # per (placement, datasource) aggregates
            shows=parser.shows,
            clicks=parser.clicks,
            shown_urls=parser.shown_urls,
            clicked_urls=parser.clicked_urls,
        ))


@cli.statinfra_job(
    options=[
        cli.Option('out_path', default=OUTPUT_PATH),
    ]
)
def make_job(job, options):
    """Build the job: for every date in ``options.dates`` map one input table to one output table.

    Input tables are read from ``<INPUT_PATH>/<date>``; results are written
    to ``$job_root/<date>``, where ``job_root`` is ``options.out_path``.
    Returns the configured job.
    """
    job = job.env(
        yt_spec_defaults=dict(pool_trees=["physical"], use_default_tentative_pool_trees=True,
                              job_io={"table_writer": {"max_row_weight": 128 * 1024 * 1024}}),
        templates=dict(job_root=options.out_path, title='Images commercial cube'),
    )

    # INPUT_PATH may end with '/', so drop trailing separators before joining
    # to keep a single '/' between the directory and the date.
    input_root = INPUT_PATH.rstrip('/')

    for date in options.dates:
        # The stream only needs to be registered on the job; its handle is
        # not used afterwards.
        job.table('{}/{}'.format(input_root, date)) \
            .map(parse_commercial) \
            .put('$job_root/{}'.format(date))

    return job


# Script entry point: hand control to the nile CLI runner.
if __name__ == '__main__':
    cli.run()
