# -*- coding: utf-8 -*-

from itertools import combinations
#from functools import partial
from copy import copy
import datetime as dt

from nile.api.v1 import (
    extractors as ne,
    aggregators as na,
    filters as nf,
    statface as ns,
    Record,
    cli
)

from nile.api.v1 import (
    with_hints, extended_schema, renamed_schema, modified_schema
)

from qb2.api.v1 import (
    extractors as qe,
    filters as qf,
    typing as qt
)

# File name handed to StatfaceReport.from_yaml_config() in make_job.
REPORT_YAML_CONFIG = 'features.yaml'

# Table paths from the company snapshot export.
# COMPANY_TO_RUBRIC and PROVIDERS are not referenced by the code in this module.
COMPANY = '//home/altay/db/export/current-state/snapshot/company'
COMPANY_TO_RUBRIC = '//home/altay/db/export/current-state/snapshot/company_to_rubric'
COMPANY_TO_PROVIDER = '//home/altay/db/export/current-state/snapshot/company_to_provider'
COMPANY_TO_FEATURE = '//home/altay/db/export/current-state/snapshot/company_to_feature'
COMPANY_TO_DUPLICATE = '//home/altay/db/export/current-state/snapshot/company_to_duplicate'
ENUM_VALUES = '//home/altay/db/export/current-state/snapshot/feature_enum_value'
PROVIDERS = '//home/altay/db/export/current-state/snapshot/provider'
FEATURES = '//home/altay/db/export/current-state/snapshot/feature'

# Other input tables read by make_job.
POPULARITY = '//home/sprav/assay/common/Popularity'
GOLDEN_PERMALINKS = '//home/travel/fminlos/golden_permalinks'

# Dimensions that add_totals rolls up to "total" and that the final
# aggregation in make_job groups by (together with "fielddate").
FIELDS = ('feature_name', 'provider', 'where')
# Plain record fields that feature_map / fields_to_features emit as
# pseudo-features by default.
MORE_FIELDS = ('address', 'name')

@with_hints(output_schema=extended_schema())
def add_totals(records):
    """Yield each record plus its "total" roll-up variants.

    For every subset of FIELDS (the empty subset included, which reproduces
    the record unchanged) one copy of the record is emitted.  In that copy
    each chosen field that is present in the record is overwritten with the
    string "total", except for keys listed in ``overrides``, which receive
    the value stored there instead.
    """
    dimensions = FIELDS
    overrides = {"rubric": 0}
    for rec in records:
        base = rec.to_dict()
        for size in range(len(dimensions) + 1):
            for chosen in combinations(dimensions, size):
                rolled = copy(base)
                for name in chosen:
                    # Only existing keys are rewritten; a dimension missing
                    # from the record is never added to it.
                    if name in rolled:
                        rolled[name] = overrides.get(name, "total")
                yield Record(**rolled)


@with_hints(output_schema=dict(original_id=str, bool_val=qt.Optional[qt.Bool], enum_val=str, int_val=int, feature_name=str, provider=str, where=str))
def feature_map(records, fields=MORE_FIELDS):
    """Expand a feed record into one record per feature and per extra field.

    For every entry of the record's "feature" list (falling back to
    "_feature" when the former is missing or empty) a record is yielded with
    ``feature_name`` set to the entry's "id" and the entry's "value" stored
    according to its type: booleans go to ``bool_val``, integers to
    ``int_val``; for any other value ``enum_val`` is set to the entry's
    "enum_id".  The slots that are not used keep their placeholders
    (``bool_val=None``, ``enum_val=""``, ``int_val=-1``).

    Afterwards one record per name in ``fields`` is yielded, with
    ``feature_name`` equal to that name and ``field_val`` holding
    ``rec.get(name, "")``.

    NOTE(review): ``field_val`` is emitted below but is not listed in the
    output schema declared in the decorator - confirm whether the schema
    should include it.
    """
    for rec in records:
        original_id = rec.original_id
        features = rec.get("feature", []) or rec.get("_feature", [])
        # Both lookups may still produce None, so normalise to a list.
        features = features or []
        for feat in features:
            result = dict(original_id=original_id,
                          bool_val=None,
                          enum_val="",
                          int_val=-1,
                          feature_name=feat.get("id"),
                          provider=rec.provider,
                          where=rec.where)
            value = feat.get("value")
            # bool must be tested before int: bool is a subclass of int.
            if isinstance(value, bool):
                result["bool_val"] = value
            elif isinstance(value, int):
                result["int_val"] = value
            else:
                result["enum_val"] = feat.get("enum_id")
            yield Record(**result)
        for field in fields:
            yield Record(original_id=original_id,
                         feature_name=field,
                         field_val=rec.get(field, ""),
                         provider=rec.provider,
                         where=rec.where,
                         int_val=-1,
                         enum_val="",
                         bool_val=None)


#@with_hints(output_schema=dict(enum_value_id=int, provider_ids=[int], permalink=int, feature_name=str))
@with_hints(output_schema=modified_schema(extend=dict(enum_value_id=int), exclude=["enum_values", "value"]))
def flatten(records):
    """Emit one record per element of each record's ``enum_values``.

    Every output record carries the element as ``enum_value_id`` together
    with the source record's ``provider_ids``, ``permalink`` and
    ``feature_name``.
    """
    for rec in records:
        for enum_value_id in rec.enum_values:
            row = dict(enum_value_id=enum_value_id,
                       provider_ids=rec.provider_ids,
                       permalink=rec.permalink,
                       feature_name=rec.feature_name)
            yield Record(**row)


def null2zero(val):
    """Return ``val`` unchanged when it is truthy, otherwise ``0``."""
    if val:
        return val
    return 0


def if_equals(x, y):
    """Compare two values, returning None when both of them are None.

    In every other case (including exactly one side being None) the result
    is that of ``x == y``.
    """
    both_missing = x is None and y is None
    return None if both_missing else x == y


def how_close(x, y):
    """Return the Jaccard similarity of two collections.

    Each argument is converted to a set; an argument without ``__iter__``
    (for example None) is treated as an empty set.  The result is
    ``len(intersection) / len(union)`` as a float.

    Returns None when there is nothing to compare: both arguments are None,
    or both sets end up empty (previously the latter raised
    ZeroDivisionError).
    """
    if x is None and y is None:
        return None
    x = set(x) if hasattr(x, "__iter__") else set()
    y = set(y) if hasattr(y, "__iter__") else set()
    union_len = len(x | y)
    if not union_len:
        # No values on either side: the ratio is undefined.
        return None
    return float(len(x & y)) / union_len


def safe_percent(x, y):
    """Return the share of ``x`` in ``x + y`` as a percentage.

    When both arguments are falsy (zero or None) the result is ``0.``
    instead of attempting the division.
    """
    if x or y:
        return 100.*x/(x+y)
    return 0.


def get_value(record, value_field="value", subvalue_field="lang", subvalue_value="RU"):
    """Collect ``value_field`` from the elements matching a selector.

    ``record`` is iterated as a sequence of dict-like elements.  An element
    is selected when ``elem.get(subvalue_field) == subvalue_value``; for
    each selected element ``elem.get(value_field, "")`` is appended to the
    result.  A falsy ``record`` (None, empty list) gives an empty list.
    """
    if not record:
        return []
    picked = []
    for item in record:
        if item.get(subvalue_field) == subvalue_value:
            picked.append(item.get(value_field, ""))
    return picked


def take_first(vals):
    """Return the first element of ``vals``, or "" when ``vals`` is falsy."""
    if not vals:
        return ""
    return vals[0]


@with_hints(output_schema=dict(feature_name=str, sprav_field_val=str, permalink=int))
def fields_to_features(records, fields=MORE_FIELDS):
    """Turn selected fields of each record into feature-like records.

    For every name in ``fields`` one record is yielded with ``feature_name``
    set to that name, ``sprav_field_val`` set to ``rec.get(name, "")`` and
    ``permalink`` copied from the source record.
    """
    for rec in records:
        for name in fields:
            row = dict(feature_name=name,
                       sprav_field_val=rec.get(name, ""),
                       permalink=rec.permalink)
            yield Record(**row)


@cli.statinfra_job(
    options=[cli.Option('provider', default='ostrovok')]
)
def make_job(job, options, nirvana, statface_client):
    """Standard function according to Statistics conventions,
    see https://clubs.at.yandex-team.ru/statistics/1143

    Assembles the job that compares feature values taken from the provider
    feed tables with the values stored in the company snapshot tables,
    stores the intermediate tables under ``$job_root`` and publishes the
    aggregated match statistics to the Statface report.

    NOTE(review): ``options`` (and its 'provider' option) is accepted but
    never read here; the provider list is hard-coded below.

    Returns the configured ``job``.
    """

    # Hard-coded switch: when True, every source is restricted to the
    # permalinks of GOLDEN_PERMALINKS and a separate report/dir is used.
    golden = False

    if golden is True:
        REPORT_TITLE = u'Метрики золотой базы: фичи'
        REPORT_PATH = "Adhoc/Hotels/Golden_hotels_features"
        DEFAULT_DIR = "home/travel/analytics/golden_hotels_features"
    else:
        REPORT_TITLE = u'Метрики базы: фичи'
        REPORT_PATH = "Adhoc/Hotels/Hotel_features"
        DEFAULT_DIR = "home/travel/analytics/hotel_features"

    report = ns.StatfaceReport() \
        .from_yaml_config(REPORT_YAML_CONFIG)\
        .path(REPORT_PATH)\
        .title(REPORT_TITLE.encode('utf8'))\
        .scale("daily")\
        .client(statface_client)

    # The run date is used both as the output sub-directory and as the
    # report's "fielddate".
    today = dt.datetime.now().date().isoformat()

    job_root = nirvana.directories[0] if nirvana.directories else DEFAULT_DIR

    job = job.env(
        default_memory_limit=2048,
        templates=dict(job_root="/".join([job_root, today]),
                       )
    )

    golden_permalinks = job.table(GOLDEN_PERMALINKS)

    popularity = job.table(POPULARITY)

    enum_values = job.table(ENUM_VALUES)\
        .project(sprav_enum_val="internal_name",
                 enum_value_id="id")

    binary_features = job.table(FEATURES)\
        .filter(nf.equals("value_type", "logical_value"))\
        .project(feature_name="internal_name")

    duplicate_permalinks = job.table(COMPANY_TO_DUPLICATE)\
        .project(permalink="duplicate_permalink")

    company_to_feature = job.table(COMPANY_TO_FEATURE)\
        .project("provider_ids",
                 "enum_values",
                 "value",
                 permalink="company_permalink",
                 feature_name="feature_permalink",
                 )

    if golden is True:
        company_to_feature = company_to_feature\
            .join(golden_permalinks, by="permalink")

    # Boolean features: rows with a defined "value" whose feature is of
    # value_type "logical_value".
    company_to_binary_feature = company_to_feature\
        .filter(qf.defined("value"))\
        .join(binary_features, by="feature_name")\
        .project("permalink",
                 "feature_name",
                 "provider_ids",
                 sprav_bool_val=ne.custom(lambda x: bool(int(x)), "value").add_hints(type=bool))

    # Enum features: one row per enum value id, resolved to its internal
    # name and collected back into a distinct list per company and feature.
    company_to_enum_feature = company_to_feature\
        .filter(qf.defined("enum_values"))\
        .map(flatten)\
        .join(enum_values, by="enum_value_id")\
        .project("permalink",
                 "provider_ids",
                 "feature_name",
                 "sprav_enum_val")\
        .groupby("permalink", "feature_name")\
        .aggregate(provider_ids=na.any("provider_ids"),
                   sprav_enum_vals=na.distinct("sprav_enum_val"))

    company_fields = job.table(COMPANY)

    if golden is True:
        company_fields = company_fields\
            .join(golden_permalinks, by='permalink')

    # Address and main Russian name, emitted as pseudo-features.
    company_fields = company_fields\
        .project("permalink",
                 address=ne.custom(lambda x: x.get("formatted", {}).get(
                     "value") if x else "", "address").add_hints(type=str),
                 name=ne.custom(lambda elems: take_first([elem["value"]["value"] for elem in elems if elem["value"]["locale"] == "ru" and elem.get("type") == "main"]) if elems else "", "names").add_hints(type=str))\
        .map(fields_to_features)

    sprav_features = job.concat(
        company_to_binary_feature, company_to_enum_feature, company_fields)\
        .put('$job_root/sprav_features')

    providers = ("ostrovok", "expedia", "booking", "travelline", "hotelscombined", "101hotels")
    provider_names = ["ytravel_{}".format(provider) for provider in providers]

    # Company-to-provider links for the listed providers, without the
    # permalinks that appear as duplicates in COMPANY_TO_DUPLICATE.
    sprav = job.table(COMPANY_TO_PROVIDER)\
        .project("original_id",
                 "provider_permalink",
                 permalink="company_permalink")\
        .filter(qf.one_of("provider_permalink", provider_names))\
        .join(duplicate_permalinks, by="permalink", type="left_only")

    if golden is True:
        sprav = sprav\
            .join(golden_permalinks, by="permalink")

    sprav = sprav\
        .join(sprav_features, by="permalink")\
        .join(popularity, by="permalink")\
        .groupby("provider_permalink", "original_id", "feature_name")\
        .aggregate(permalink_list=na.distinct("permalink"),
                   sprav_bool_val=na.any("sprav_bool_val"),
                   sprav_enum_vals=na.any("sprav_enum_vals"),
                   sprav_field_val=na.any("sprav_field_val"),
                   provider_ids=na.any("provider_ids"),
                   popularity=na.max("popularity"))\
        .project(ne.all(),
                 # "ytravel_<provider>" -> "<provider>"
                 provider=ne.custom(lambda x: x.split(
                     "_")[-1], "provider_permalink").add_hints(type=str),
                 fielddate=ne.const(today),
                 permalinks=ne.custom(lambda x: ",".join(map(str, sorted(x))), "permalink_list").add_hints(type=str))\
        .put('$job_root/sprav')

    wheres = ("russia", "world")
    tables = [job.table("//home/travel/prod/feeds/{provider}/latest/parsed/{where}_hotels".format(provider=provider, where=where))
              for provider in providers for where in wheres]

    feeds = job.concat(*tables)\
        .project("feature",
                 "_feature",
                 table_name=qe.table_path("feature").add_hints(type=str),
                 original_id="originalId",
                 address=ne.custom(lambda x: take_first(
                     get_value(x, value_field="one_line")) or take_first(get_value(x, subvalue_value=None)), "address").add_hints(type=str),
                 name=ne.custom(lambda x: take_first(get_value(x)) or take_first(get_value(x, subvalue_value=None)), "name").add_hints(type=str),
                 url=ne.custom(lambda x: take_first(
                     get_value(x, subvalue_field="type", subvalue_value="BOOKING")), "url").add_hints(type=str)
                 )\
        .project(ne.all(),
                 # Both values are cut out of the source table path: the
                 # last component and the fourth component from the end.
                 where=ne.custom(lambda x: x.split("/")[-1], "table_name").add_hints(type=str),
                 provider=ne.custom(lambda x: x.split("/")[-4] if len(x.split("/"))>3 else "unknown", "table_name").add_hints(type=str))\
        .map(feature_map)\
        .put('$job_root/feeds')

    # Records without a boolean value carry bool_val=None and are dropped.
    feed_bool_features = feeds\
        .filter(nf.custom(lambda x: x is not None, "bool_val"))\
        .project("original_id",
                 "feature_name",
                 "provider",
                 "bool_val",
                 "where")

    feed_enum_features = feeds\
        .filter(nf.not_(nf.equals("enum_val", "")))\
        .groupby("original_id", "feature_name", "provider", "where")\
        .aggregate(enum_vals=na.distinct("enum_val"))

    feed_field_features = feeds\
        .filter(qf.defined("field_val"))

    feed_vals = job.concat(feed_bool_features,
                           feed_enum_features,
                           feed_field_features)

    sprav_and_feed = sprav\
        .join(feed_vals, by=("original_id", "feature_name", "provider"))\
        .project(ne.all(),
                 bool_match=ne.custom(if_equals, "bool_val", "sprav_bool_val").add_hints(type=bool),
                 # how_close returns a float ratio (or None), so the hint
                 # is float rather than bool.
                 enum_match=ne.custom(
                     how_close, "enum_vals", "sprav_enum_vals").add_hints(type=float),
                 field_match=ne.custom(if_equals, "field_val", "sprav_field_val").add_hints(type=bool))\
        .put('$job_root/sprav_with_feeds')

    sprav_and_feed\
        .map(add_totals)\
        .groupby("fielddate", *FIELDS)\
        .aggregate(bool_match_count=na.count(predicate=nf.equals("bool_match", True)),
                   bool_non_match_count=na.count(
                       predicate=nf.equals("bool_match", False)),
                   bool_match_popularity=na.sum(
                       "popularity", predicate=nf.equals("bool_match", True)),
                   bool_non_match_popularity=na.sum(
                       "popularity", predicate=nf.equals("bool_match", False)),
                   enum_match_median=na.median("enum_match",
                                               predicate=nf.not_(nf.equals("enum_match", None))),
                   enum_match_count=na.count(
                       predicate=nf.equals("enum_match", 1)),
                   enum_non_match_count=na.count(
                       predicate=nf.custom(lambda x: x is not None and x < 1, "enum_match")),
                   # enum_match is None for non-enum rows; guard before the
                   # numeric comparison, like the non-match predicates do.
                   enum_partial_match_count=na.count(predicate=nf.custom(
                       lambda x: x is not None and 0 < x < 1, "enum_match")),
                   enum_match_popularity=na.sum(
                       "popularity", predicate=nf.equals("enum_match", 1)),
                   enum_non_match_popularity=na.sum(
                       "popularity", predicate=nf.custom(lambda x: x is not None and x < 1, "enum_match")),
                   enum_partial_match_popularity=na.sum(
                       "popularity", predicate=nf.custom(lambda x: x is not None and 0 < x < 1, "enum_match")),
                   field_match_count=na.count(
                       predicate=nf.equals("field_match", True)),
                   field_non_match_count=na.count(
                       predicate=nf.equals("field_match", False)),
                   field_match_popularity=na.sum(
                       "popularity", predicate=nf.equals("field_match", True)),
                   field_non_match_popularity=na.sum(
                       "popularity", predicate=nf.equals("field_match", False)),

                   )\
        .project(ne.all(),
                 # Replace missing aggregates with 0 before the percentages
                 # are computed.
                 bool_match_count=ne.custom(null2zero, "bool_match_count").add_hints(type=int),
                 bool_non_match_count=ne.custom(
                     null2zero, "bool_non_match_count").add_hints(type=int),
                 bool_match_popularity=ne.custom(
                     null2zero, "bool_match_popularity").add_hints(type=float),
                 bool_non_match_popularity=ne.custom(
                     null2zero, "bool_non_match_popularity").add_hints(type=float),
                 enum_match_count=ne.custom(null2zero, "enum_match_count").add_hints(type=int),
                 enum_non_match_count=ne.custom(
                     null2zero, "enum_non_match_count").add_hints(type=int),
                 enum_match_popularity=ne.custom(
                     null2zero, "enum_match_popularity").add_hints(type=float),
                 enum_non_match_popularity=ne.custom(
                     null2zero, "enum_non_match_popularity").add_hints(type=float),
                 field_match_count=ne.custom(null2zero, "field_match_count").add_hints(type=int),
                 field_non_match_count=ne.custom(
                     null2zero, "field_non_match_count").add_hints(type=int),
                 field_match_popularity=ne.custom(
                     null2zero, "field_match_popularity").add_hints(type=float),
                 field_non_match_popularity=ne.custom(
                     null2zero, "field_non_match_popularity").add_hints(type=float)
                 )\
        .project(ne.all(),
                 bool_non_match_percent=ne.custom(
                     safe_percent, "bool_non_match_count", "bool_match_count").add_hints(type=float),
                 enum_non_match_percent=ne.custom(
                     safe_percent, "enum_non_match_count", "enum_match_count").add_hints(type=float),
                 bool_non_match_popularity_percent=ne.custom(
                     safe_percent, "bool_non_match_popularity", "bool_match_popularity").add_hints(type=float),
                 enum_non_match_popularity_percent=ne.custom(
                     safe_percent, "enum_non_match_popularity", "enum_match_popularity").add_hints(type=float),
                 field_non_match_percent=ne.custom(
                     safe_percent, "field_non_match_count", "field_match_count").add_hints(type=float),
                 field_non_match_popularity_percent=ne.custom(
                     safe_percent, "field_non_match_popularity", "field_match_popularity").add_hints(type=float),
                 )\
        .put('$job_root/sprav_feeds_aggregation')\
        .publish(report,  remote_mode=True, allow_change_job=True)

    # NOTE(review): this stream is built but never stored or published,
    # unlike feed_without_sprav below - confirm whether a .put() is missing.
    sprav_without_feed = sprav\
        .join(feed_vals, by=("original_id", "feature_name", "provider"), type="left_only")

    feed_without_sprav = sprav\
        .join(feed_vals, by=("original_id", "feature_name", "provider"), type="right_only")\
        .put('$job_root/feeds_without_sprav')

    return job


# Script entry point: when the file is executed directly, call cli.run()
# from nile.api.v1.
if __name__ == "__main__":
    cli.run()
