#!/usr/bin/env python 
from mapreducelib import MapReduce, Record
import sys
import os
import optparse
import csv
import StringIO
from urlparse import urlparse, parse_qs

def parseOptions(argv):
    """Parse the command line and return the optparse options object.

    argv is the full argument vector with the program name at index 0
    (the shape of sys.argv).  If argv is None, optparse falls back to
    sys.argv[1:] on its own.

    The -d option is mandatory: when it is missing or empty the parser
    reports the problem via parser.error(), which exits the process
    (SystemExit).
    """
    usage = 'usage: %prog <args>'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", dest="date", type='string', help="date")

    # Parse the vector we were given instead of silently re-reading
    # sys.argv; index 0 is the program name and is not an option.
    cli_args = None if argv is None else argv[1:]
    options, _ = parser.parse_args(cli_args)

    if not options.date:
        parser.error('Please, specify date')

    return options

# URL path prefixes of the request handlers the extractor keeps.
known_handlers = ['/search', '/yandsearch']

# Names assigned, in positional order, to the fields of a parsed log line.
access_names = [
    'remote_ip',
    'blank1',
    'blank2',
    'time',
    'tz',
    'url',
    'http_status',
    'respond_size',
    'referer',
    'user_agent',
    'virtual_host',
    'x_forwarded_for',
    'cookies',
    'cts',
    'respond_time_sec',
    'respond_time_msec',
    'cluster_no',
    'reqid',
    'apache_pid',
    'balancer_ip',
    'yuid',
    'fuid',
    'compress_ratio',
    'internal',
    'ruip',
    'susp_robot',
    'internal_request',
    'passport_uid',
    'scheme',
    'testids',
    'headers',
]

# Fields copied (tab-joined, in this order) into the emitted record value.
to_output = ['http_status', 'url', 'referer']

# Test ids looked for in a line's testids field; the first one present wins.
testids_2check = [
    '17484',
    '17485',
    '17486',
    '17487',
    '17493',
    '17492',
    '17494',
]

class Extractor:
    """Map callable that keeps redirected search requests of tracked test ids.

    For every input record the value is parsed as one space-delimited,
    double-quoted log line.  A single Record is yielded only when all of
    the following hold; any other line is silently dropped:

      * the request path starts with one of known_handlers;
      * the http_status field equals '302';
      * the comma-separated testids field contains one of testids_2check.
    """

    def __init__(self, date):
        # Stored as given; __call__ does not read it.
        self.date = date

    def __call__(self, rec):
        """Yield at most one Record for the log line held in rec.value.

        The Record is constructed positionally from 'yuid<TAB>testid', the
        matched handler prefix, and the tab-joined to_output fields.
        """
        reader = csv.reader(StringIO.StringIO(rec.value),
                            delimiter=' ', quotechar='"')
        try:
            fields = next(reader)
        except (csv.Error, StopIteration):
            # Unparseable line, or an empty value that yields no row at all.
            return

        try:
            # Field 5 is presumably the request line ("METHOD path PROTO");
            # its second token is what gets matched against the handlers.
            path = fields[5].split(' ')[1]
        except IndexError:
            # Too few fields, or a request field without a second token.
            return

        # The last matching prefix wins, as there is no early exit.
        handler = None
        for prefix in known_handlers:
            if path.startswith(prefix):
                handler = prefix
        if not handler:
            return

        # zip() stops at the shorter sequence, so a truncated line simply
        # produces a dict that lacks the trailing field names.
        row = dict(zip(access_names, fields))
        row['time'] = row['time'][1:]   # drop the leading character
        row['tz'] = row['tz'][:-1]      # drop the trailing character

        # .get() keeps truncated lines on the "skip" path instead of
        # aborting the whole map with a KeyError.
        if row.get('http_status') != '302':
            return

        raw_testids = row.get('testids')
        if raw_testids is None or raw_testids == '-':
            return

        # Pick the first id of testids_2check (in its own order) that the
        # line carries.
        testids = raw_testids.split(',')
        found = None
        for candidate in testids_2check:
            if candidate in testids:
                found = candidate
                break
        if not found:
            return

        # testids is positioned after yuid and every to_output field in
        # access_names, so its presence implies those keys exist too.
        yield Record('\t'.join([row['yuid'], found]),
                     handler,
                     '\t'.join(row[name] for name in to_output))

def main(options):
    """Set up the MapReduce client and run the Extractor map for options.date.

    Reads the table 'access_log/<date>' and writes the result to
    'shining/RESEARCH-1494/<date>' with sortMode enabled.
    """
    client_defaults = dict(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
    )
    MapReduce.useDefaults(**client_defaults)
    # Debugging leftovers, kept for reference: MapReduce.useDefaults(testMode=True)
    # and the scratch tables 'shining/tmp' (input) / 'shining/tmp2' (output).

    source_table = 'access_log/' + options.date
    result_table = 'shining/RESEARCH-1494/' + options.date

    mapper = Extractor(options.date)
    MapReduce.runMap(
        mapper,
        srcTable=source_table,
        dstTable=result_table,
        sortMode=True,
    )

if __name__ == '__main__':
    # Script entry point: parse the command line, then launch the map job.
    main(parseOptions(sys.argv))
