uthor__ = 'kosotis'

from collections import defaultdict
from datetime import datetime
import urllib
import random
import sys
import os

from mapreducelib import MapReduce, Record, MapReduceClient
from parselib import DatesRange
import libra
import uatraits


def get_text(request):
    """Return the raw value following the first ``text=`` in *request*.

    The value runs from just after ``text=`` up to (not including) a
    delimiter. Delimiters are tried in a fixed priority order -- ``&``, tab,
    space, ``;``, ``?`` -- and the first *kind* that occurs anywhere in the
    remainder wins, even if a lower-priority delimiter appears earlier in the
    string. If none occurs, the whole remainder is returned.

    The value is returned as-is (no URL-unquoting).

    :param request: URL or request string; may be ``None`` or empty.
    :return: the extracted value (possibly ``''``), or ``None`` when
        *request* is empty/``None`` or contains no ``text=``.
    """
    # None/empty input has no parameters; return None instead of raising
    # AttributeError on ``None.find``.
    if not request:
        return None

    marker = 'text='
    start = request.find(marker)
    if start == -1:
        return None
    # NOTE(review): this is a plain substring search, so a parameter whose
    # name merely ends in "text" (e.g. "context=") is matched as well.

    value = request[start + len(marker):]
    # Priority order, not "earliest delimiter": '&' is preferred over the
    # rest whenever it is present.
    for delimiter in ('&', '\t', ' ', ';', '?'):
        end = value.find(delimiter)
        if end != -1:
            return value[:end]
    return value


def Reduce(key, recs):
    """Emit referer-less web requests of one user, split by ``clid``.

    For every ``TYandexWebRequest`` in the parsed session that has no
    referer, a record is yielded for each known ``clid`` marker found in the
    full request string. The record key is the unquoted ``text=`` value of
    the full request, the subkey is empty, and the value is
    ``<full request>\\t<referer>\\t<unquoted referer text>``.

    :param key: reduce key; only keys whose first character is ``'y'`` are
        processed (presumably yandexuid-based ids -- TODO confirm).
    :param recs: records of this key, passed to ``libra.ParseSession``.
    :return: generator of ``Record`` objects; ``tableIndex`` 0, 1 and 2
        correspond to the three clid markers below (the actual tables are
        whatever the caller passes as ``dstTables``, in that order).

    Processing is best-effort: any ordinary exception ends the session
    silently, keeping whatever was already yielded.
    """
    uid = key
    # Guard against an empty/None key before indexing into it.
    if not uid or uid[0] != 'y':
        return

    # (marker searched in the full request, destination table index).
    # Markers are tested independently, so one request may be written to
    # more than one table.
    clid_tables = (
        ('clid=2246298', 0),
        ('clid=2236843', 1),
        ('clid=2236844', 2),
    )

    try:
        session = libra.ParseSession(recs, './blockstat.dict')

        for request in session:
            if not request.IsA('TYandexWebRequest'):
                continue

            referer = request.Referer
            # Only requests without a referer are emitted; skip the others
            # before any decoding so that a decoding failure on an
            # irrelevant request cannot abort the rest of the session.
            if referer:
                continue

            fullrequest = request.FullRequest

            query_text = urllib.unquote(
                str(get_text(fullrequest)).encode("utf8"))
            # The referer is empty here, so get_text() finds nothing and
            # this column comes out as the literal string 'None'; it is
            # kept so the output layout stays unchanged.
            referer_text = urllib.unquote(
                str(get_text(referer or '')).encode("utf8"))

            value = (str(fullrequest) + '\t' + str(referer) + '\t'
                     + str(referer_text))

            for marker, table_index in clid_tables:
                if marker in fullrequest:
                    yield Record(str(query_text), '', value,
                                 tableIndex=table_index)
    except Exception:
        # Deliberate best-effort: drop the remainder of a session that
        # fails to parse or decode. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit / GeneratorExit propagate.
        return


def main():
    """Run the ``Reduce`` job once per daily ``user_sessions/2016MMDD`` table.

    Every run writes to the same three destination tables. Their order
    defines the ``tableIndex`` values that ``Reduce`` uses.
    """
    MapReduce.useDefaults(
        server="sakura.search.yandex.net",
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
    )

    # Position in this tuple == tableIndex emitted by Reduce.
    destinations = (
        'kosotis/ANSEARCH_526_extdate/first_query_voice',
        'kosotis/ANSEARCH_526_extdate/first_query_morda',
        'kosotis/ANSEARCH_526_extdate/first_query_stroka',
    )

    # Days to process, as MMDD suffixes of the 2016 source tables.
    days = [
        '0104', '0105', '0106', '0107', '0108',
        '0109', '0110', '0111', '0112', '0113',
        '0114', '0115', '0116', '0117', '0118',
    ]

    # NOTE: the job runs with appendMode=True and the destination tables are
    # not dropped here, so each day's output is added to the same tables.
    for mmdd in days:
        MapReduce.runReduce(
            Reduce,
            srcTable='user_sessions/2016' + mmdd,
            dstTables=list(destinations),
            files=['/home/kosotis/lib/blockstat.dict'],
            appendMode=True,
            sortMode=True,
        )


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    main()









