from nile.api.v1 import (
    Record,
    files,
    clusters,
    with_hints,
    filters as nf,
    aggregators as na,
    extractors as ne
)
import os
import re

# (type name, label) pairs, checked in order; the first type for which
# `r.IsA(...)` is true wins.
_SERVICE_BY_REQUEST_TYPE = (
    ('TWebRequestProperties', 'web'),
    ('TImagesRequestProperties', 'images'),
    ('TVideoRequestProperties', 'video'),
    ('TCbirRequestProperties', 'cbir'),
    ('TMapsRequestProperties', 'maps'),
    ('TPortalRequestProperties', 'portal'),
    ('TNewsRequestProperties', 'news'),
    ('TYandexCollectionsRequestProperties', 'collections'),
)

_UI_BY_REQUEST_TYPE = (
    ('TDesktopUIProperties', 'desktop'),
    ('TTouchUIProperties', 'touch'),
    ('TMobileUIProperties', 'mobile'),
    ('TMobileAppUIProperties', 'mobileapp'),
    ('TPadUIProperties', 'pad'),
    ('TSiteSearchUIProperties', 'sitesearch'),
)


def get_service(r):
    """Classify a request by service and by UI kind.

    Args:
        r: object exposing ``IsA(type_name)``; it is probed with the
            request-type names and then with the UI-type names, each list
            in a fixed priority order.

    Returns:
        A ``(service, ui)`` tuple of strings. Each element is the label of
        the first matching type, or ``''`` when none of the known types
        matches.
    """
    # Generator expressions keep the short-circuit behaviour of an if/elif
    # chain: IsA is not called for entries after the first match.
    service = next(
        (label for type_name, label in _SERVICE_BY_REQUEST_TYPE if r.IsA(type_name)),
        '',
    )
    ui = next(
        (label for type_name, label in _UI_BY_REQUEST_TYPE if r.IsA(type_name)),
        '',
    )
    return service, ui


def parse_sessions(sessions):
    """Reducer that turns grouped raw log rows into per-request Records.

    For every ``(key, records)`` group the rows are parsed with
    ``libra.ParseSession`` and one ``Record`` is yielded for each parsed
    request that passes the checks below.

    Args:
        sessions: iterable of ``(key, records)`` pairs, as passed to a
            reducer after ``groupby``.

    Yields:
        Record with the fields ``query``, ``service``, ``platform``,
        ``domain`` (upper-cased ``ServiceDomRegion``), ``region_id``,
        ``fresh_intent`` (float, ``-1.0`` when the ``IMAGESQUICK`` value is
        absent) and ``wizard_position`` (int, ``9999`` when the
        ``UPPER.ApplyBlender.IntentPos/WIZIMAGES`` property is absent).
    """
    # Imported inside the function; presumably because libra is only
    # available where the reducer runs (it is attached to the operation as
    # a file by the job definition) -- TODO confirm.
    import libra

    for _key, records in sessions:
        try:
            session = libra.ParseSession(records, 'blockstat.dict')
        except Exception:
            # Deliberate best effort: a group that libra cannot parse is
            # dropped instead of failing the whole operation.
            continue

        for r in session:
            # Requests that carry a non-zero PageNo are skipped.
            if hasattr(r, 'PageNo') and r.PageNo != 0:
                continue
            if not hasattr(r, 'SearchPropsValues'):
                continue

            search_props = r.SearchPropsValues

            # The property is a '|'-separated list of 'name:value' items;
            # items without ':' are ignored and only the first two
            # ':'-separated pieces of an item are used.
            blender_props = {}
            raw_fmls = search_props.get('UPPER.ApplyImagesBlender.fmls', '')
            for item in raw_fmls.split('|'):
                if ':' not in item:
                    continue
                pieces = item.split(':')
                blender_props[pieces[0]] = pieces[1]

            service, platform = get_service(r)
            yield Record(
                query=r.Query,
                service=service,
                platform=platform,
                domain=r.ServiceDomRegion.upper(),
                region_id=r.UserRegion,
                fresh_intent=float(blender_props.get('IMAGESQUICK', -1.0)),
                wizard_position=int(
                    search_props.get('UPPER.ApplyBlender.IntentPos/WIZIMAGES', 9999)
                ),
            )

def filter_intents(service, fresh_intent, wizard_position):
    """Decide whether a row is kept.

    Args:
        service: service label of the row.
        fresh_intent: numeric value compared against the 0.3844 threshold.
        wizard_position: numeric position compared against 9.

    Returns:
        For ``service == 'web'``: True when ``wizard_position <= 9``.
        For any other service: True when ``fresh_intent > 0.3844``.
    """
    if service != 'web':
        # Non-web rows are judged only by the fresh-intent value.
        return fresh_intent > 0.3844
    # Web rows are judged only by the wizard position.
    return wizard_position <= 9


def norm_query(query):
    """Return a normalized form of ``query`` for grouping.

    The query is lower-cased and every occurrence of the characters
    ``.``, ``,``, ``!``, ``/``, ``?``, space and ``-`` is removed.
    """
    lowered = query.lower()
    noise = re.compile(r'[\.,!/\? -]')
    return noise.sub('', lowered)


def aggregate_queries(groups):
    """Reducer that collapses each group to its last record plus a count.

    Args:
        groups: iterable of ``(key, records)`` pairs.

    Yields:
        For every group whose last record is truthy, a ``Record`` built
        from that last record with an added ``frequency`` field holding
        the number of records in the group. Empty groups yield nothing.
    """
    for _key, rows in groups:
        total = 0
        last_row = None
        # Exhaust the group: afterwards `total` is the row count and
        # `last_row` is the final row (both untouched for an empty group).
        for total, last_row in enumerate(rows, 1):
            pass
        if last_row:
            yield Record(last_row, frequency=total)


def process_blender_intent(images_path, web_path, token, table_date, table_prefix):
    """Build and run the job that writes the per-query intent table.

    The two input tables are concatenated, grouped by ``key`` and sorted by
    ``subkey``, reduced with ``parse_sessions``, filtered with
    ``filter_intents``, and then aggregated per
    ``(service, platform, domain, norm_query)``.

    Args:
        images_path: path of the first input table.
        web_path: path of the second input table.
        token: token passed to the Hahn cluster client.
        table_date: last path component of the output table.
        table_prefix: directory part of the output table path.

    Returns:
        A one-element list ``[{'cluster': 'hahn', 'table': <output path>}]``.
    """
    out_table_path = table_prefix + '/' + table_date

    hahn = clusters.yt.Hahn(token=token)
    job = hahn.job().env(
        package_paths=[os.path.dirname(__file__)],
        packages=[__name__],
    )

    images_log = job.table(images_path)
    web_log = job.table(web_path)

    # Stage 1: parse sessions into one row per request.
    grouped_log = images_log.concat(web_log).groupby('key').sort('subkey')
    requests = grouped_log.reduce(
        parse_sessions,
        memory_limit=3 * 1024,
        files=[
            files.RemoteFile('//statbox/resources/libra.so'),
            files.RemoteFile('//statbox/statbox-dict-last/blockstat.dict'),
        ],
    )

    # Stage 2: keep only the rows accepted by filter_intents and attach the
    # normalized query used as a grouping key.
    accepted = requests.filter(
        nf.custom(filter_intents, 'service', 'fresh_intent', 'wizard_position')
    )
    with_norm_query = accepted.project(
        ne.all(),
        norm_query=ne.custom(norm_query, 'query'),
    )

    # Stage 3: aggregate per (service, platform, domain, norm_query).
    per_query = with_norm_query.groupby('service', 'platform', 'domain', 'norm_query')
    summary = per_query.aggregate(
        query=na.any('query'),
        region_id=na.any('region_id'),
        fresh_intent=na.mean('fresh_intent'),
        wizard_position=na.mean('wizard_position'),
        frequency=na.count(),
    )
    summary.put(out_table_path)

    job.run()

    return [{'cluster': 'hahn', 'table': out_table_path}]
