
import sys
import urllib
import random

from regexps_and_stuff import *


# Numeric block identifiers mapped to their symbolic names.
# user_sessions_redir_common() uses this table to translate each numeric
# component of the dotted counter variable '143' into a readable name.
blockstat = {
    1034: 'web3',
    1035: 'tr',
    1049: 'widget',
    1066: 'theme',
    1119: 'com',
    1191: 'campslist',
    1192: 'multiedit',
    1193: 'camp',
    1201: 'ajax',
    1276: 'copy',
    1277: 'search-by-image',
    1352: 'favorites',
    15: 'morda',
    153: 'search',
    155: 'direct',
    231: 'video',
    277: 'images',
    28: 'ru',
    469: 'similar',
    584: 'touch',
    629: 'index',
    80: 'web',
    861: 'other',
    875: 'card',
    899: 'plain',
    985: 'market_test',
    1065: 'ppb',
    1042: 'ua',
    1116: 'by',
    1117: 'kz',
    1118: 'com_tr',
    1033: 'timing',
    1440: 'worker',
    1667: 'pad',
    2543: 'granny',
}

def parse_yt(value):
    """Split a tab-separated ``key=value`` record into its parts.

    The keys ``yandexuid``, ``ip`` and ``timestamp`` are returned
    separately; the service keys ``_stbx`` and ``_logfeller_index_bucket``
    are dropped; every other key is collected into a dict.

    Only the first ``=`` of a chunk separates key from value, so values
    may themselves contain ``=``.  Empty chunks (e.g. produced by an empty
    record or a doubled tab) are skipped, and a chunk without ``=`` yields
    a ``None`` value instead of raising ``IndexError``.

    Args:
        value: the raw record string.

    Returns:
        A tuple ``(yandexuid, ip, timestamp, fields)``; the first three
        are strings or ``None`` when absent, ``fields`` is a dict.
    """
    skipped_keys = ('_stbx', '_logfeller_index_bucket')
    fields = {}
    yandexuid = None
    ip = None
    timestamp = None

    for pair in value.split('\t'):
        if pair == '':
            continue

        key, sep, val = pair.partition('=')
        if not sep:
            # Key without a value: keep the key, mark the value as missing.
            val = None

        if key in skipped_keys:
            continue

        if key == 'yandexuid':
            yandexuid = val
        elif key == 'ip':
            ip = val
        elif key == 'timestamp':
            timestamp = val
        else:
            fields[key] = val

    return yandexuid, ip, timestamp, fields

def redir_record_handler(rec):
    """Handle one redir-log record.

    ``rec.value`` is parsed with parse_yt() and the result is passed to
    user_sessions_redir_common().  When that call produces a dict, it is
    tagged with ``type='redir'``; any other result is returned unchanged.
    """
    uid, address, ts, fields = parse_yt(rec.value)
    result = user_sessions_redir_common(fields, uid, ts, address, rec)

    if not isinstance(result, dict):
        return result

    result['type'] = 'redir'
    return result


def user_sessions_record_handler(rec):
    """Handle one user_sessions record.

    ``rec.value`` is a tab-separated list of ``key=value`` fields;
    ``rec.key`` is used as the yandexuid and ``rec.subkey`` as the
    timestamp.  Fields without ``=`` are stored with a ``None`` value and,
    unless the key is ``url``, reported on stderr.

    Returns:
        ``"!TECH"`` when the record's ``type`` field is not ``TECH``
        (including when the field is missing altogether); otherwise the
        result of user_sessions_redir_common(), tagged with
        ``type='user_session'`` when that result is a dict.
    """
    yandexuid = rec.key
    timestamp = rec.subkey
    params = {}

    for field in rec.value.split('\t'):
        field = field.strip()
        if not field:
            continue

        key, sep, value = field.partition('=')
        if sep:
            params[key] = value
            continue

        params[key] = None
        if key != 'url':
            sys.stderr.write(
                "Strange field: '" + field + "'\nin record:" + rec.value + "\n")

    # .get() so that a record without any 'type' field is rejected like any
    # other non-TECH record instead of raising KeyError.
    if params.get('type') != 'TECH':
        return "!TECH"

    record = user_sessions_redir_common(params, yandexuid, timestamp, None, rec)

    if isinstance(record, dict):
        record['type'] = 'user_session'

    return record


# Visibility codes from counter var '1484' mapped to readable names.
_VISIBILITY_NAMES = {1: 'visible', 2: 'hidden', 3: 'prerender'}

# Counters sometimes send an absolute timestamp instead of a time difference;
# timing values at or above this bound are rejected (the original note
# describes the bound as roughly 1M seconds).
_TIMING_LIMIT = 1000000000


def _parse_counter_vars(params, rec):
    """Parse the comma-separated ``vars`` field of *params* into a dict."""
    vars_fields = params.get('vars')
    assert vars_fields is not None, "no vars param!"

    # Quote bug fix: an unescaped comma inside the user agent string.
    vars_fields = vars_fields.replace('KHTML,%20like', 'KHTML%20like')

    # Broken counters at the end of March / beginning of April 2013.
    vars_fields = vars_fields.replace(',=', '=')

    chunks = vars_fields.split(',')

    # HACK: some of the vars end up inside the url field; pick them up
    # from there whenever the url contains a comma.
    url_field = params.get('url')
    if url_field is not None and ',' in url_field:
        chunks += ('url=' + url_field).split(',')

    vars_params = {}
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue

        key, sep, value = chunk.partition('=')
        if sep:
            vars_params[key] = value
        else:
            vars_params[key] = None
            sys.stderr.write(
                "Strange var: '" + chunk + "'\nin field:" + params['vars']
                + "'\nin record:" + rec.value + "\n")

    return vars_params


def _decode_service_path(raw_service):
    """Translate the numeric components of a dotted path via ``blockstat``."""
    if not raw_service:
        return None

    names = []
    for part in raw_service.split('.'):
        if part == '':
            names.append('')
            continue
        name = blockstat.get(int(part))
        # Unknown codes are kept as they are.
        names.append(part if name is None else name)

    return '.'.join(names)


def _parse_timing(name, raw):
    """Convert one raw timing value to int and check it against the limit."""
    if raw is None:
        return None

    value = int(float(raw))
    assert value < _TIMING_LIMIT, "%s is too big %s " % (name, value)
    return value


def _pageno_from_reqid(reqid):
    """Extract the page number from the single ``p<N>`` part of a reqid."""
    parts = reqid.split('-')
    if len(parts) < 2:
        # Not a search reqid.
        return None

    page_parts = [part for part in parts if part[:1] == 'p']
    if not page_parts:
        return 0
    if len(page_parts) == 1:
        return int(page_parts[0][1:])

    sys.stderr.write("WTF strange reqid" + reqid + "\n")
    return None


def _is_expected_service_url_mismatch(service_ui, host):
    """Tell whether a service/URL disagreement is one of the known cases."""
    if 'video.search.ajax' in service_ui and '/video/' in host:
        return True
    if 'images.search.ajax' in service_ui and (
            '/images/' in host or '/gorsel/' in host):
        return True

    if not host.endswith('/'):
        return False

    if '.morda.touch' in service_ui:
        return 'www.yandex.' in host or 'm.yandex.' in host

    morda_desktop = ('.morda.plain' in service_ui
                     or '.morda.widget' in service_ui
                     or '.morda.theme' in service_ui)
    return morda_desktop and 'www.yandex.' in host


def user_sessions_redir_common(params, yandexuid, timestamp, ip, rec):
    """Build a timing record from the parsed fields of one log record.

    Args:
        params: dict of the record's fields; read keys are ``path``,
            ``vars``, ``url``, ``reqid``, ``referer``, ``HTTP_REFERER``,
            ``ip``, ``service`` and ``ui``.
        yandexuid: accepted for interface compatibility; not used here.
        timestamp: record timestamp as a string; converted with ``int``
            for the ``ts`` field and used as the prefix of a generated
            reqid when the record has none.
        ip: client ip, or ``None`` to fall back to ``params['ip']``.
        rec: the original record; only ``rec.value`` is read, for
            diagnostics written to stderr.

    Returns:
        * ``"!timing"`` when ``path`` is neither ``690.1033`` (regular)
          nor ``690.1201`` (ajax);
        * a string (the part of the service identifier before the first
          ``/``) when update_one_service() yields ``None`` for it;
        * otherwise a dict with the keys ``reqid``, ``ts``, ``service``,
          ``user_agent``, ``user_agent_version``, ``region``, ``pageno``,
          ``tld``, ``host_script``, ``https``, ``turbo``, ``ajax``,
          ``visibility`` and the stage timings ``.dns``,
          ``.tcp_handshake``, ``.ttfb``, ``.html``, ``.html_total``,
          ``.ttfp``, ``.ttfp_total``.

    Raises:
        AssertionError: no ``vars`` field, unknown visibility code, a
            timing at or above the limit, missing referer, or a malformed
            service name.
        ValueError: a non-numeric visibility, service path component,
            timing, reqid page or timestamp.
    """
    path = params.get('path')
    if path == '690.1033':
        ajax = False
    elif path == '690.1201':
        ajax = True
    else:
        return "!timing"

    vars_params = _parse_counter_vars(params, rec)

    visibility = vars_params.get('1484')
    if visibility is None:
        visibility = 'visible'
    else:
        code = int(visibility)
        assert code in _VISIBILITY_NAMES, "wtf visibility is %s" % visibility
        visibility = _VISIBILITY_NAMES[code]

    raw_service = _decode_service_path(vars_params.get('143'))

    reqid = params.get('reqid')
    if reqid is None:
        # Bug before 18 June 2014: reqid was missing, so generate an
        # arbitrary one.  Check there is no such bug later.
        reqid = (timestamp + "000000-"
                 + str(random.SystemRandom().randint(0, 2**63))
                 + "00000-FAKE-REQID")

    dns = _parse_timing('dns', vars_params.get('1037'))
    tcp_handshake = _parse_timing('tcp_handshake', vars_params.get('1038'))
    ttfb = _parse_timing('ttfb', vars_params.get('1039'))
    html = _parse_timing('html', vars_params.get('1040'))
    html_total = _parse_timing('html_total', vars_params.get('1040.906'))
    ttfp = _parse_timing('ttfp', vars_params.get('1041'))
    ttfp_total = _parse_timing('ttfp_total', vars_params.get('1041.906'))

    # A missing or non-numeric region is not an error, just "unknown".
    try:
        real_region = int(vars_params.get('287'))
    except (TypeError, ValueError):
        real_region = None

    referer = params.get('referer')
    if referer is None:
        referer = params.get('HTTP_REFERER')

    assert referer, 'some idiots called counter directly'

    https = None
    if referer.startswith("http://"):
        https = False
        referer = referer[7:]
    elif referer.startswith("https://"):
        https = True
        referer = referer[8:]

    # The page number comes from the reqid only: the 'p=' parameter of the
    # referer URL is not reliable.
    pageno = _pageno_from_reqid(reqid)

    if ip is None:
        ip = params.get('ip')

    host_script = referer.split('?')[0] if referer else None

    # TODO: change for redir for yourself!!!!
    service = params.get('service')
    ui = params.get('ui')

    tld = None
    if host_script is not None:
        tld = get_tld(host_script)

    if service is None and ui is None:
        service_ui = raw_service
        if service_ui is None:
            service_ui = host_script
    else:
        service_ui = "%s-%s" % (service, ui)

    if ajax and service_ui:
        service_ui_ajax = service_ui.replace('.ajax', '')
    else:
        service_ui_ajax = service_ui

    # Real service name which will be used.
    service_by_service = update_one_service(service_ui_ajax)

    # Override because of a bug.
    if host_script and '/searchapp' in host_script:
        service_by_service = 'searchapp'

    if service_by_service is None:  # we skip market and direct for now
        return service_ui_ajax.split('/')[0]

    assert ' ' not in service_by_service, "Strange service, %s" % service_by_service
    assert service_by_service.count('/') <= 1, "Strange service, %s" % service_by_service

    # Used only for checking url/service coherence.
    service_by_url = update_one_service(host_script)

    if (service_by_service != service_by_url
            and service_ui is not None
            and service_ui not in ("ru.images.search-by-image",
                                   "ru.images.similar")
            and 'direct' not in service_ui):
        # host_script may be None here; this check is diagnostics only and
        # must not fail the record because of that.
        if not _is_expected_service_url_mismatch(service_ui, host_script or ''):
            sys.stderr.write(
                "service_by_service != service_by_url: %s != %s\n%s"
                % (service_ui, host_script, rec.value) + "\n")

    ts = int(timestamp)
    ua, ver = update_user_agent(vars_params.get('1042'))

    return {
        'reqid': reqid,
        'ts': ts,
        'service': service_by_service,
        'user_agent': ua,
        'user_agent_version': ver,
        'region': real_region,
        'pageno': pageno,
        'tld': tld,
        'host_script': host_script,
        'https': https,
        'turbo': is_turbo(ip),
        'ajax': ajax,
        'visibility': visibility,
        # stages
        '.dns': dns,
        '.tcp_handshake': tcp_handshake,
        '.ttfb': ttfb,
        '.html': html,
        '.html_total': html_total,
        '.ttfp': ttfp,
        '.ttfp_total': ttfp_total,
    }

