# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import urllib, re,random
import csv
import StringIO
from urlparse import urlparse, parse_qs

# Column names for one parsed access-log row, in positional order.
# Get_UA zips this list against the space-delimited fields of a log line,
# so the order here must match the order of the fields in the log.
access_names = 'remote_ip blank1 blank2 time tz url http_status respond_size referer user_agent virtual_host x_forwarded_for cookies cts respond_time_sec respond_time_msec cluster_no reqid apache_pid balance    r_ip yuid fuid compress_ratio internal ruip x-yandex-suspected-robot x-yandex-internal-request passport_uid scheme test_ids headers'.split()

def Get_UA(rec):
    """Map one access-log record to a (yuid -> yp, user agent) record.

    ``rec.key`` is carried through as the yp key and ``rec.value`` is parsed
    as a space-delimited, double-quote-quoted access-log line.

    Yields at most one Record per input:
      * tableIndex 0 -- the line could not be parsed (CSV error text, or the
        raw line when the request field / handler is missing);
      * tableIndex 1 -- the line has no ``yuid`` or ``user_agent`` column
        (the row is shorter than ``access_names``);
      * tableIndex 2 -- key ``yuid``, value ``yp + '\\t' + user_agent``.

    Requests whose handler is neither ``/yandsearch`` nor ``/search/`` are
    dropped without output.
    """
    yp = rec.key
    line = rec.value

    reader = csv.reader(StringIO.StringIO(line), delimiter=' ', quotechar='"')
    try:
        data = next(reader)
    except csv.Error as e:
        yield Record(yp, '', str(e), tableIndex=0)
        return
    except StopIteration:
        # Empty value: there is no row to parse.  Ending the generator here
        # keeps the historical "emit nothing" behaviour explicit instead of
        # relying on StopIteration leaking out of the generator body.
        return

    try:
        # Field 5 is the request, e.g. 'GET /yandsearch?text=... HTTP/1.1';
        # the handler is the path part before the query string.
        url = data[5]
        handler = url.split('?')[0].split()[1]
    except IndexError:
        yield Record(yp, '', line, tableIndex=0)
        return

    if handler not in ('/yandsearch', '/search/'):
        return

    data = dict(zip(access_names, data))
    # Drop the first character of 'time' and the last character of 'tz'
    # (presumably the '[' and ']' wrapping the timestamp -- confirm).
    data['time'] = data['time'][1:]
    data['tz'] = data['tz'][:-1]

    try:
        uid = data['yuid']
        ua = data['user_agent']
    except KeyError:
        yield Record(yp, '', line, tableIndex=1)
        # Must stop here: uid/ua are unbound, so falling through to the
        # final yield would raise NameError.
        return

    yield Record(str(uid), '', yp + '\t' + str(ua), tableIndex=2)

def MapACC(rec):
    """Emit access-log lines whose ``yp`` cookie mentions ``gpauto``.

    Looks for the first ``'; yp='`` in ``rec.value`` and takes the text from
    there up to (not including) the next ``';'``, or to the end of the line
    when there is none.  If that fragment contains ``'gpauto'``, yields one
    Record keyed by the fragment (truncated to 200 characters) with the
    whole line as its value.  Lines without ``'; yp='`` or without
    ``'gpauto'`` produce no output.

    The key keeps the leading ``'; '`` of the match, as before.
    """
    line = rec.value
    start = line.find('; yp=')
    if start < 0:
        return

    yp = line[start:]
    # Search from offset 2 so the leading '; ' of the match is skipped and
    # the returned index is relative to ``yp`` itself.  (The previous code
    # searched yp[2:] but sliced yp with that index, cutting off the last
    # two characters of the cookie value.)
    end = yp.find(';', 2)
    if end >= 0:
        yp = yp[:end]

    if 'gpauto' in yp:
        yield Record(yp[:200], '', line)


def main(run_extract=False):
    """Configure MapReduce and run the yp-cookie / user-agent pipeline.

    Stage 1 (optional): run ``MapACC`` over each daily access-log table and
    write the matching lines to the working table.
    Stage 2 (always): run ``Get_UA`` over the working table, writing parse
    errors, field errors and the final records to three destination tables.

    :param run_extract: when true, run stage 1 before stage 2.  Defaults to
        False, which matches the previous behaviour where stage 1 was
        skipped by an unconditional ``continue`` in the loop.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode = True,  (disabled)
    )

    # Working table shared by both stages.  Defined outside the loop because
    # stage 2 needs it even when stage 1 does not run.
    dt = 'ensuetina/YP_COOKIE/try'

    if run_extract:
        days = ['1120']
        for day in days:
            MapReduce.runMap(
                MapACC,
                srcTable='access_log/2015' + day,
                dstTable=dt,
                sortMode=True,
            )

    # dstTables order corresponds to the tableIndex values used by Get_UA.
    # NOTE(review): 'field_errors' has no leading underscore, unlike its
    # siblings -- it looks like a typo, but the name is kept unchanged so
    # existing consumers of that table keep working.  Confirm before renaming.
    MapReduce.runMap(
        Get_UA,
        srcTable=dt,
        dstTables=[dt + '_parse_errors', dt + 'field_errors', dt + '_ua'],
        sortMode=True,
    )


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
