# -*- coding: UTF-8 -*-
"""The script collects data relevant for hotel analytics from user sessions.
'Relevant' requests are those where at least one of the following is true:
1) The old travel (hotel or tours) wizard is present
2) The new travel 1Org or carousel is present
3) 'UPPER.Dumper.FirstGeoDocRubricId' is one of the 7 rubrics (TRAVEL_RUBRICS)
   and a 1Org or OrgMn wizard is present
4) The organic web results contain some relevant top-level domains (see get_hotel_urls function)
5) The value "UPPER.OrgWizard.CarouselClickedOrg" is not empty (normally, it can be non-empty
only when the carousel is also shown)
"""
import argparse
import os
import traceback
import datetime as dt
from collections import defaultdict

import nile
from nile.api.v1 import (
    clusters,
    extractors as ne,
    aggregators as na,
    filters as nf,
    Record
)



from qb2.api.v1 import (
    filters as sf,
)

# Directory that holds the local helper module ``common.py``; it is taken
# from the JWD environment variable (KeyError if the variable is not set).
directory = os.environ['JWD']
# Make that directory importable so that ``common`` can be imported below.
os.sys.path.append(directory)
from common import TRAVEL_RUBRICS, WizardType, WIZARDS

# Files passed to the reduce operation in main() via ``files=``:
# the local ``common.py``, the blockstat dictionary that parse_session()
# hands to libra.ParseSession, and libra.so (presumably the binary behind
# the ``libra`` module imported inside parse_session -- TODO confirm).
STATBOX_FILES = [
    nile.files.LocalFile(os.path.join(directory, 'common.py')),
    nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
    nile.files.RemoteFile('statbox/resources/libra.so')
]


# YT tables of the company snapshot that main() joins with the requests.
# Both live in the same snapshot directory and must use the same absolute
# form (leading '//'); COMPANY_TO_RUBRIC used to start with a single slash.
COMPANY_TO_RUBRIC = '//home/altay/db/export/current-state/snapshot/company_to_rubric'
COMPANY_TO_PROVIDER = '//home/altay/db/export/current-state/snapshot/company_to_provider'
# Default value of --job_root: the directory the result tables are put under.
DEFAULT_TARGET_DIR = 'home/travel/analytics/user_sessions'

# Search-props keys that get_search_props_vals() copies into the output
# record, mapped to the short names they are stored under.
SEARCH_PROPS = {
    'UPPER.Dumper.FirstGeoDocRubricId': 'FirstRubricId',

    'UPPER.OrgWizard.DisplayMode': 'DisplayMode',
    'UPPER.OrgWizard.GeoKind': 'GeoKind',
    'UPPER.OrgWizard.HasYandexTravel': 'HasTravel',
    'UPPER.OrgWizard.HasYandexTravelTop1': 'HasTravelTop1',
    'UPPER.OrgWizard.QueryType': 'QueryType',
    'UPPER.OrgWizard.Rubrics': 'Rubrics',
    'UPPER.OrgWizard.TravelClassifierValue': 'TravelClassifierValue',
    'UPPER.OrgWizard.SelectedSource': 'SelectedSource',
}

# Search-props keys that parse_session() reads one by one.
CAROUSEL_CLICKED_ORG = 'UPPER.OrgWizard.CarouselClickedOrg'  # -> carousel_clicked_org
SOURCE_ID_PAIRS = 'UPPER.OrgWizard.SourceIdPairs'  # -> permalinks
WIZARD_SOURCES = 'UPPER.OrgWizard.WizardSources'  # -> sources



def get_wizard_type(wizard_path, wizards_list=WIZARDS):
    """Get the wizard type and the meaningful suffix from a converted path.

    Args:
        wizard_path: converted block path (a string) or None.
        wizards_list: mapping {path prefix: wizard type}; WIZARDS by default.

    Returns:
        A ``(wizard_type, suffix)`` tuple. For the first prefix (in the
        mapping's iteration order) that ``wizard_path`` starts with,
        ``suffix`` is the path with that leading prefix cut off. If no
        prefix matches, or the path is None, ``(None, wizard_path)`` is
        returned.
    """
    if wizard_path is None:
        return None, wizard_path
    # .items() (not the Python-2-only .iteritems()) works for any mapping.
    for prefix, wizard_type in wizards_list.items():
        if wizard_path.startswith(prefix):
            # Cut only the leading prefix: str.replace() would also remove
            # every later occurrence of the same substring in the path.
            return wizard_type, wizard_path[len(prefix):]
    return None, wizard_path


def get_val_from_vars(item, val="subtype"):
    """Look up one variable of a block or click by name.

    ``item.GetVars()`` is iterated as (key, value) pairs; the value of the
    first pair whose key equals ``"-" + val`` is returned. If there is no
    such pair the result is None.
    """
    matches = (value for key, value in item.GetVars() if key == "-" + val)
    return next(matches, None)


def get_blocks(request):
    """Collect the wizard blocks of the request, grouped by wizard type.

    Every block of ``request.GetBSBlocks()`` is classified with
    get_wizard_type(); blocks of an unknown type are skipped.

    Returns:
        A defaultdict(list) keyed by wizard type. For the carousel and the
        1Org-like wizards the value is a list with one dict per block
        (keys: path, subtype, price, permalink, pos); for any other known
        wizard the value is just True.
    """
    # Wizard types for which per-block details are collected.
    detailed_types = (
        WizardType.Carousel,
        WizardType.TravelOneOrg,
        WizardType.OneOrg,
        WizardType.TravelOneOrgCompany,
        WizardType.TravelOrgMn,
        WizardType.TravelOneOrgRight,
    )
    results = defaultdict(list)
    has_travel_form = False
    for block in request.GetBSBlocks():
        wizard_type, path = get_wizard_type(block.Path)
        if wizard_type is None:
            continue
        # A OneOrg wizard that contains a '/form' block is re-labelled
        # below as TravelOneOrgCompany.
        if wizard_type == WizardType.OneOrg and path.startswith('/form'):
            has_travel_form = True
        if wizard_type in detailed_types:
            results[wizard_type].append({
                "path": path,
                "subtype": get_val_from_vars(block),
                "price": get_val_from_vars(block, val="price"),
                "permalink": get_val_from_vars(block, val="oid"),
                "pos": get_val_from_vars(block, val="item"),
            })
        else:
            results[wizard_type] = True
    if has_travel_form:
        # NOTE(review): this replaces whatever was already collected under
        # TravelOneOrgCompany with the OneOrg blocks -- confirm the two can
        # never be shown together, otherwise the former are lost.
        results[WizardType.TravelOneOrgCompany] = results.pop(WizardType.OneOrg)
    return results


def get_device(request):
    """Return the device name for a web search request.

    The request is probed with ``request.IsA(<class name>)`` for each known
    web-request class; the device mapped to the first matching class is
    returned. For any other request the result is None.
    """
    device_by_class = {
        "TTouchYandexWebRequest": "touch",
        "TYandexWebRequest": "desktop",
        "TPadYandexWebRequest": "pad",
        "TMobileAppYandexWebRequest": "app",
    }
    for class_name in device_by_class:
        if request.IsA(class_name):
            return device_by_class[class_name]
    return None


# Second-level domains of the hotel/travel sites whose urls are collected.
_HOTEL_HOSTS = frozenset((
    "booking.com", "101hotels.ru", "ostrovok.ru", "vashotel.ru", "hotel24.ru",
    "tophotels.ru", "hotels.com", "hotellook.ru", "hotels.ru", "mirturbaz.ru",
    "domotdiha.ru", "ruhotel.su", "1001tur.ru", "hochu-na-yuga.ru",
    "turpravda.com", "ughotels.ru", "travelata.ru", "posrednikov-net.com",
    "putevka.com", "nakubani.ru", "turvopros.com", "tez-tour.com",
    "katalogturbaz.ru",
))


def get_hotel_urls(request):
    """Get the urls of the main results that lead to hotel-related sites.

    A url is kept when the last two labels of its host (so that
    'www.<anything>.booking.com' counts as 'booking.com') are one of
    _HOTEL_HOSTS, or when the host is on tripadvisor.ru and the first path
    segment starts with 'hotel' (case-insensitively).

    Results without a ``Url`` attribute or with an empty url are skipped.
    Urls without a '//' part and tripadvisor urls without a path are
    handled instead of raising IndexError.

    Returns:
        The list of matching urls in the order of ``request.GetMainBlocks()``.
    """
    urls = []
    for block in request.GetMainBlocks():
        url = getattr(block.GetMainResult(), "Url", None)
        if not url:
            continue
        # Drop everything up to the first '//' (the scheme); a url that has
        # no '//' at all is taken as 'host/path' as is.
        _, separator, rest = url.partition("//")
        segments = (rest if separator else url).split("/")
        top_domain = ".".join(segments[0].split(".")[-2:]).lower()
        if top_domain == "tripadvisor.ru":
            # Only the hotel pages of tripadvisor are of interest.
            if len(segments) > 1 and segments[1].lower().startswith("hotel"):
                urls.append(url)
        elif top_domain in _HOTEL_HOSTS:
            urls.append(url)
    return urls


def get_clicks(request):
    """Describe every click of the request.

    Returns:
        A list with one dict per click of ``request.GetClicks()``. The keys
        are: path, delay, url, is_dynamic, direct (whether the click has
        'TDirectClickProperties'), dwelltime, item_n and permalink (the
        click's '-item' and '-oid' variables).
    """
    return [
        {
            "path": click.ConvertedPath,
            "delay": click.DelayAfterRequest,
            "url": click.Url,
            "is_dynamic": click.IsDynamic,
            "direct": bool(click.GetProperties('TDirectClickProperties')),
            "dwelltime": click.DwellTimeOnService,
            "item_n": get_val_from_vars(click, val="item"),
            "permalink": get_val_from_vars(click, val="oid"),
        }
        for click in request.GetClicks()
    ]


def get_search_props_val(request, val):
    """Return the search-props value stored under the key ``val``.

    An empty string is returned when the request has no
    ``SearchPropsValues`` attribute or the key is absent.
    """
    if not hasattr(request, "SearchPropsValues"):
        return ""
    return request.SearchPropsValues.get(val, "")


def get_search_props_vals(request):
    """Collect the search props listed in SEARCH_PROPS.

    Returns:
        A dict {short name: value} with an entry for every SEARCH_PROPS key
        present in ``request.SearchPropsValues``. The dict is empty when
        the request has no such attribute.
    """
    if not hasattr(request, "SearchPropsValues"):
        return {}
    props = request.SearchPropsValues
    return {
        short_name: props[full_name]
        for full_name, short_name in SEARCH_PROPS.items()
        if full_name in props
    }


def get_entity_info(request):
    """Get the ontoid and otype of the entity-search object of the request
    (no matter if the object was shown or banned).

    The data is read from the '|'-separated search prop
    "UPPER.EntitySearch.Log": the first field must be "1", the third to
    fifth fields are otype, ontoid and a fallback ontoid. The fallback is
    used only when ontoid is empty and the fallback starts with "lst".
    Fields missing from a short log are treated as empty strings, so a
    truncated log does not raise ValueError.

    Returns:
        {"ontoid": ..., "otype": ...}, or None when the request has no
        ``SearchPropsValues`` or the first log field is not "1".
    """
    if not hasattr(request, "SearchPropsValues"):
        return None
    log_data_items = request.SearchPropsValues.get(
        "UPPER.EntitySearch.Log", "").split("|")
    if log_data_items[0] != "1":
        return None
    fields = log_data_items[2:5]
    # Pad to exactly three fields so that the unpacking below cannot fail.
    fields += [""] * (3 - len(fields))
    otype, ontoid, ontoid_from_with_type = fields
    if not ontoid and ontoid_from_with_type.startswith("lst"):
        ontoid = ontoid_from_with_type
    return {"ontoid": ontoid, "otype": otype}


def int_monad(val):
    """Convert ``val`` with int(); the empty string gives None instead."""
    return None if val == "" else int(val)


def handle_args():
    """Parse the command-line arguments of the script.

    ``--init_date`` and ``--final_date`` are given as YYYY-MM-DD and are
    converted to ``datetime.date``; when omitted they default to yesterday
    and today respectively (the final date is not included in the range).

    Returns:
        The argparse namespace with job_root, init_date, final_date,
        memory_limit and fast.
    """
    date_format = "%Y-%m-%d"
    yesterday = (dt.datetime.today() - dt.timedelta(1)).date()
    today = dt.datetime.now().date()

    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('--job_root', help='yt path to tables',
                        default=DEFAULT_TARGET_DIR)
    parser.add_argument("--init_date")
    parser.add_argument("--final_date", help="not included")
    parser.add_argument('--memory_limit', type=int, default=5000)
    parser.add_argument('--fast', type=int)
    args = parser.parse_args()

    if args.init_date:
        args.init_date = dt.datetime.strptime(args.init_date, date_format).date()
    else:
        args.init_date = yesterday

    if args.final_date:
        args.final_date = dt.datetime.strptime(args.final_date, date_format).date()
    else:
        args.final_date = today

    return args


def daterange(start_date, end_date):
    """Yield every date from ``start_date`` (included) up to ``end_date``
    (not included), one day at a time.

    Nothing is yielded when ``end_date`` is not later than ``start_date``.
    """
    # range() instead of the Python-2-only xrange(): the number of days is
    # small, and timedelta.days is already an int.
    for offset in range((end_date - start_date).days):
        yield start_date + dt.timedelta(offset)


def _is_travel_request(blocks, hotel_urls, carousel_clicked_org, search_props):
    """Tell whether a request is relevant for the hotels analytics."""
    if hotel_urls or carousel_clicked_org:
        return True
    # A travel rubric counts only together with a 1Org / OrgMn wizard.
    if search_props.get("FirstRubricId") in TRAVEL_RUBRICS and (
            WizardType.OneOrg in blocks or WizardType.OrgMn in blocks):
        return True
    travel_wizards = (
        WizardType.Carousel,
        WizardType.TravelOneOrg,
        WizardType.TravelOneOrgCompany,
        WizardType.TravelOneOrgRight,
        WizardType.TravelOrgMn,
        WizardType.OldTours,
    )
    return any(wizard in blocks for wizard in travel_wizards)


def parse_session(records, uid):
    """Parse the session of one user and yield its travel-related requests.

    ``records`` are parsed with ``libra.ParseSession`` using
    'blockstat.dict'. Requests for which get_device() gives nothing are
    skipped, as well as requests that are not relevant for the hotels
    analytics.

    Yields:
        ``(True, Record)`` for every relevant request, or a single
        ``(False, Record(uid, error))`` holding the traceback if an
        unexpected exception interrupts the processing. A RuntimeError
        silently ends the session.
    """
    import libra
    try:
        for request in libra.ParseSession(records, 'blockstat.dict'):
            device = get_device(request)
            if not device:
                continue

            blocks = get_blocks(request)
            hotel_urls = get_hotel_urls(request)
            search_props = get_search_props_vals(request)
            clicked_raw = get_search_props_val(request, CAROUSEL_CLICKED_ORG)
            carousel_clicked_org = int_monad(clicked_raw) or 0

            if not _is_travel_request(blocks, hotel_urls,
                                      carousel_clicked_org, search_props):
                continue

            if hasattr(request, "ServiceDomRegion"):
                region = request.ServiceDomRegion
            else:
                region = None
            if hasattr(request, "UserRegion"):
                user_region = request.UserRegion
            else:
                user_region = None

            source_ids = get_search_props_val(request, SOURCE_ID_PAIRS).split()
            # Only the non-zero, purely numeric ids are kept.
            permalinks = [int(elem) for elem in source_ids
                          if elem.isdigit() and elem != "0"]
            timestamp = request.Timestamp

            yield True, Record(
                query=request.Query,
                reqid=request.ReqID,
                parent_reqid=request.WebParentReqId,
                full_request=request.FullRequest,
                passport_uid=request.PassportUID or "",
                uid=uid,

                device=device,
                region=region,
                user_region=user_region,
                test_ids=[elem.TestID for elem in request.GetTestInfo()],
                time_isoformatted=dt.datetime.fromtimestamp(timestamp).isoformat(),
                timestamp=timestamp,

                entity_search=get_entity_info(request),
                search_props=search_props,
                hotel_urls=hotel_urls,
                carousel_clicked_org=carousel_clicked_org,
                permalinks=permalinks,
                blocks=blocks,
                clicks=get_clicks(request),
                sources=get_search_props_val(request, WIZARD_SOURCES),
            )
    except RuntimeError:
        # Deliberately ignored: the session is dropped without an error
        # record.
        pass
    except Exception:
        yield False, Record(uid=uid,
                            error=traceback.format_exc())


def reducer(groups, results, errors):
    """Reduce the sessions grouped by user.

    Every group is parsed with parse_session(); the records of the relevant
    requests go to the ``results`` output and the error records go to the
    ``errors`` output.
    """
    for key, records in groups:
        for is_ok, record in parse_session(records, key.key):
            output = results if is_ok else errors
            output(record)


def main():
    """Run the collection job for every date of the requested range.

    For each date from --init_date (included) to --final_date (not
    included) a Nile job on the Hahn cluster is run inside one transaction.
    The job:

    1. reads the clean search sessions (the daily table of the date, or the
       "fast" table when --fast is given) and reduces them with reducer();
    2. puts the error records to '$job_root/errors/<name>';
    3. derives a permalink for each request (the clicked carousel
       organization, otherwise the first of its permalinks) and left-joins
       the travel rubric and the Yandex Travel id of the company by it;
    4. puts the result to '$job_root/<name>'.

    <name> is the value of --fast when given, otherwise the ISO date.
    Requires the YT_TOKEN environment variable.
    """
    args = handle_args()
    fast = args.fast

    # NOTE(review): with --fast the same fast table is processed once per
    # date of the range -- pass a single-day range in that case.
    for date in daterange(args.init_date, args.final_date):
        date_string = date.isoformat()
        table_name = fast or date_string

        cluster = clusters.Hahn(token=os.environ['YT_TOKEN']).env(
            yt_spec_defaults=dict(
                pool_trees=["physical"],
                tentative_pool_trees=["cloud"],
            ),
            templates=dict(
                job_root=args.job_root,
                date=date_string,
                tmp_files='{}/nile_tmp_files'.format(args.job_root),
                tmp_root='{}/nile_tmp_root'.format(args.job_root),
            ),
        )

        error_table = '$job_root/errors/{}'.format(table_name)

        with cluster.driver.transaction():
            job = cluster.job()
            if fast:
                sessions = job.table(
                    "//user_sessions/pub/search/fast/{}/clean".format(fast))
            else:
                sessions = job.table(
                    '//user_sessions/pub/search/daily/$date/clean')

            reqs, errors = sessions.groupby("key").sort("subkey").reduce(
                reducer,
                files=STATBOX_FILES,
                memory_limit=args.memory_limit,
            )

            errors.put(error_table)

            # Companies that have one of the travel rubrics.
            travel_objects = (
                job.table(COMPANY_TO_RUBRIC)
                .filter(sf.one_of("rubric_permalink", TRAVEL_RUBRICS))
                .groupby("company_permalink")
                .aggregate(main_rubric=na.last("rubric_permalink", by="is_main"))
                .project("main_rubric",
                         permalink="company_permalink",
                         has_travel_rubric=ne.const(True))
            )

            # Companies known to the 'yandex_travel' provider.
            permalink_to_hotels = (
                job.table(COMPANY_TO_PROVIDER)
                .filter(nf.equals("provider_permalink", "yandex_travel"))
                .project(permalink="company_permalink",
                         travel_id="original_id",
                         has_travel_permalink=ne.const(True))
                .unique("permalink")
            )

            (
                reqs
                .project(ne.all(),
                         permalink=ne.custom(
                             lambda permalinks, clicked: clicked or (
                                 permalinks[0] if len(permalinks) > 0 else None),
                             "permalinks", "carousel_clicked_org"))
                .join(travel_objects, by="permalink", type="left")
                .join(permalink_to_hotels, by="permalink", type="left")
                # After the left joins the flags are empty for the
                # unmatched rows; turn them into an explicit False.
                .project(ne.all(),
                         has_travel_permalink=ne.custom(
                             lambda x: x or False, "has_travel_permalink"),
                         has_travel_rubric=ne.custom(
                             lambda x: x or False, "has_travel_rubric"))
                .put('$job_root/{}'.format(table_name))
            )

            job.run()


# Run the collection only when the file is executed as a script.
if __name__ == "__main__":
    main()
