#!/usr/bin/env python 
from mapreducelib import MapReduce, Record
import sys
import os
import optparse
import csv
import StringIO
from urlparse import urlparse, parse_qs

def parseOptions(argv):
    """Parse the command line and return the optparse options object.

    argv -- full argument vector with the program name at index 0
            (e.g. sys.argv).  Passing None falls back to optparse's
            default of reading sys.argv[1:].

    Returns the options object; its `date` attribute holds the value
    given with -d.  If -d is missing, parser.error() prints the usage
    message and raises SystemExit.
    """
    usage = 'usage: %prog <args>'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", dest="date", type='string', help="date")

    # Parse the vector the caller handed in (minus the program name)
    # instead of silently reading the global sys.argv.
    args = None if argv is None else list(argv[1:])
    options, _ = parser.parse_args(args)

    if not options.date:
        parser.error('Please, specify date')

    return options

# Request paths (query string excluded) that Extractor lets through.
known_handlers = [
    '/jsonproxy/search',
    '/searchapp',
    '/suggest',
    '/mobilesearch/vps',
]

# Accepted values of the `app_platform` CGI parameter.
known_platforms = ['android', 'apad']

# Accepted values of the `app_id` CGI parameter.
known_app_ids = ['ru.yandex.searchplugin']

# Names assigned, in order, to the space-delimited fields of one
# access-log line.
access_names = [
    'remote_ip', 'blank1', 'blank2', 'time', 'tz',
    'url', 'http_status', 'respond_size', 'referer', 'user_agent',
    'virtual_host', 'x_forwarded_for', 'cookies', 'cts',
    'respond_time_sec', 'respond_time_msec', 'cluster_no', 'reqid',
    'apache_pid', 'balancer_ip', 'yuid', 'fuid', 'compress_ratio',
    'internal', 'ruip',
]

# CGI (query-string) parameters pulled out of the request URL.
cgi_names = [
    'type', 'ui', 'service', 'uuid', 'did', 'lang',
    'app_id', 'app_version', 'app_platform', 'clid',
    'manufacturer', 'model', 'os_version',
    'app_version_name', 'app_build_number',
]

# Fields written, in this order, into the value of each output record.
# NOTE(review): 'clid' is listed twice, so it is emitted twice; kept as is
# because the output column layout may be relied upon -- confirm.
to_output = [
    'type', 'did', 'clid', 'app_version_name',
    'reqid', 'app_platform', 'clid',
]

class Extractor:
    """Mapper that keeps successful requests made by the known app.

    An instance is called with one input record and yields zero or one
    output Record, which is how main() hands it to MapReduce.runMap().
    """

    # Access-log fields that __call__ reads by name after the positional
    # zip.  A line too short to contain them is skipped like any other
    # malformed line instead of raising KeyError inside the mapper.
    _required_fields = ('http_status', 'reqid', 'yuid')

    def __init__ (self, date):
        """Remember the date string that is appended to every output key."""
        self.date = date

    def __call__ (self, rec):
        """Parse one access-log line and yield a Record if it passes filters.

        rec -- input record; only rec.value is read.  It is parsed as a
               single space-delimited line with double-quote quoting.

        Yields at most one Record built from three arguments:
          * 'uuid<TAB>yuid<TAB>date',
          * the request path (handler),
          * the to_output fields joined by tabs.
        Nothing is yielded when the line is malformed, the handler is not
        in known_handlers, the status is not '200', or app_platform /
        app_id are not in the known lists.
        """
        buf = StringIO.StringIO(rec.value)
        reader = csv.reader(buf, delimiter=' ', quotechar='"')
        try:
            fields = next(reader)
        except (csv.Error, StopIteration):
            # Unparseable or empty line: nothing to emit.
            return

        try:
            # Field 5 holds the whole request line
            # ("<method> <target> <protocol>"); the target is token 1.
            target = fields[5].split()[1]
        except IndexError:
            return

        handler = target.split('?')[0]
        if handler not in known_handlers:
            return

        data = dict(zip(access_names, fields))
        if any(name not in data for name in self._required_fields):
            return

        # Drop the first character of `time` and the last of `tz`
        # (presumably the [ ] that wrap the timestamp -- TODO confirm).
        data['time'] = data['time'][1:]
        data['tz'] = data['tz'][:-1]

        if data['http_status'] != '200':
            return

        # Parse only the request target.  Feeding the whole request line to
        # urlparse would leave the trailing protocol token (" HTTP/1.1")
        # glued to the value of the last CGI parameter.
        cgi = parse_qs(urlparse(target).query)
        for cn in cgi_names:
            # Missing parameters are represented by the literal string 'None'.
            data[cn] = cgi[cn][0] if cn in cgi else 'None'

        if data['app_platform'] not in known_platforms:
            return
        if data['app_id'] not in known_app_ids:
            return

        yield Record('\t'.join([data['uuid'], data['yuid'], self.date]), handler, '\t'.join(data[x] for x in to_output))

def main(options):
    """Configure the MapReduce client and run the Extractor map for one date.

    options -- object with a `date` attribute; the date selects the source
               and destination tables and is passed to the Extractor.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
    )
    # Left disabled in the source: MapReduce.useDefaults(testMode=True)

    day = options.date
    source_table = 'access_log/' + day
    target_table = 'shining/android_app_access_serp_suggest_vps/' + day

    MapReduce.runMap(
        Extractor(day),
        srcTable=source_table,
        dstTable=target_table,
        sortMode=True,
    )

if __name__ == '__main__':
    # Script entry point: parse the command line, then launch the map job.
    main(parseOptions(sys.argv))
