#!/usr/bin/env python

import sys
import urllib2
import urllib
import datetime
from collections import defaultdict

# strftime pattern used to render dates before they are spliced into queries.
date_format = '%Y-%m-%d'

# Country code -> numeric ID substituted for the <region> placeholder
# (compared against regionToCountry(RegionID) in the queries below).
REGIONS = {
    'RU': 225,
    'TR': 983,
    'UA': 187,
    'BY': 149,
    'KZ': 159,
}

# SearchEngineID values grouped by search-engine family.
YANDEX = [2, 181, 13]
GOOGLE = [3, 68]
MAIL = [6, 119]

def get_clickhouse_data(query, start_date, end_date, region,
                        method = 'GET',
                        connection_timeout=1500,
                        host='http://mtmega01i.yandex.ru:8123/'):
    """Fill in a query template and run it over the ClickHouse HTTP endpoint.

    The template may contain the placeholders ``<start_date>``,
    ``<end_date>`` and ``<region>``; they are replaced with the formatted
    dates and the numeric ID looked up in ``REGIONS``.

    Args:
        query: query text, optionally containing the placeholders above.
        start_date: object with ``strftime`` (e.g. ``datetime.date``).
        end_date: object with ``strftime`` (e.g. ``datetime.date``).
        region: key of ``REGIONS`` (e.g. ``'RU'``).
        method: ``'GET'`` or ``'POST'``; POST sends an empty body, the query
            itself always travels in the URL.
        connection_timeout: timeout passed to ``urllib2.urlopen``.
        host: base URL of the HTTP endpoint.

    Returns:
        The raw response body as a string.

    Raises:
        AssertionError: if ``method`` is neither ``'GET'`` nor ``'POST'``.
        KeyError: if ``region`` is not a key of ``REGIONS``.
        urllib2.URLError: on connection or HTTP failures (raised by urllib2).
    """
    assert method in ['GET', 'POST'], 'Method can be either GET or POST'

    # Collapse runs of spaces left over from the backslash-continued
    # multi-line query literals into single spaces.
    query = ' '.join(token for token in query.split(' ') if token)

    query = query.replace('<start_date>', start_date.strftime(date_format))
    query = query.replace('<end_date>', end_date.strftime(date_format))
    query = query.replace('<region>', str(REGIONS[region]))
    print >> sys.stderr, query

    # NOTE(review): credentials are hard-coded in source; they should be
    # moved to configuration/environment and rotated.
    params = {'query': query,
              'user': 'analytics',
              'password': 'nxYw9S5G'
    }
    url = host + '?' + urllib.urlencode(params)

    # Log the request URL with the password masked so that the credential
    # does not end up in stderr / log files.
    loggable = dict(params, password='REDACTED')
    print >> sys.stderr, host + '?' + urllib.urlencode(loggable)

    req = urllib2.Request(url)
    if method == 'POST':
        # A non-None data argument (even empty) makes urllib2 issue a POST.
        res = urllib2.urlopen(req, timeout=connection_timeout, data = '')
    else:
        res = urllib2.urlopen(req, timeout=connection_timeout)

    try:
        print >> sys.stderr, host
        return res.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        res.close()

def get_market_share(start_date, end_date, region):
    """Return the per-day Yandex share of search-engine page views.

    Queries ``visits_all`` for ``sum(Sign*PageViews)`` grouped by
    ``SearchEngineID`` and day, restricted to the given country and to
    ``UserAgent = 5`` / ``UserAgentMajor = 8`` (MSIE 8 according to the
    original author's comment), excluding SearchEngineID 0 and 74.

    Args:
        start_date: first day of the period (inclusive).
        end_date: last day of the period (inclusive).
        region: key of ``REGIONS`` selecting the country.

    Returns:
        A string with one ``<date>\\t<share>\\n`` line per day, sorted by
        date, where share is the page views of engines listed in ``YANDEX``
        divided by the page views of all returned engines. Days whose total
        is zero are reported as ``nan``. An empty response yields ``''``.
    """
    query ="SELECT \
                sum(Sign*PageViews), \
                SearchEngineID, \
                toDate(StartDate) \
            FROM \
                visits_all \
            WHERE \
                    (StartDate >= toDate('<start_date>')) \
                AND (StartDate <= toDate('<end_date>')) \
                AND (regionToCountry(RegionID) = <region>) \
                AND (UserAgent = 5) \
                AND (UserAgentMajor = 8) \
                AND NOT (SearchEngineID IN (0, 74)) \
            GROUP BY \
                SearchEngineID, \
                toDate(StartDate) \
            "

    data = get_clickhouse_data(query, start_date, end_date, region)

    market_share_data = defaultdict(lambda: {'all': 0, 'yandex': 0})
    for line in data.strip().split('\n'):
        fields = line.split()
        if not fields:
            # Blank line (e.g. the whole response was empty): nothing to add.
            continue
        count, se, date = fields
        count = float(count)
        se = int(se)

        if se in YANDEX:
            market_share_data[date]['yandex'] += count
        market_share_data[date]['all'] += count

    rows = []
    for date in sorted(market_share_data):
        totals = market_share_data[date]
        # Sign-weighted sums can cancel out to zero; the share is then
        # undefined, so report NaN instead of raising ZeroDivisionError.
        if totals['all']:
            share = totals['yandex'] / totals['all']
        else:
            share = float('nan')
        rows.append(date + '\t' + str(share) + '\n')
    return ''.join(rows)

def get_serp_hits(start_date, end_date, region):
    """Fetch daily hit counts from ``hits_all`` for counter 731962.

    Rows are limited to the requested date range and country, to
    ``UserAgent = 5`` / ``UserAgentMajor = 8`` (MSIE 8 according to the
    original author's comment) and to hits without ``DontCountHits``.
    The raw response of ``get_clickhouse_data`` is returned unchanged.
    """
    sql = "SELECT \
                count(), \
                EventDate\
            FROM \
                hits_all \
            WHERE \
                    (EventDate >= toDate('<start_date>')) \
                AND (EventDate <= toDate('<end_date>')) \
                AND (regionToCountry(RegionID) = <region>) \
                AND (UserAgent = 5) \
                AND (UserAgentMajor = 8) \
                AND NOT DontCountHits \
                AND (CounterID = 731962) \
            GROUP BY \
                EventDate \
            "

    response = get_clickhouse_data(query=sql,
                                   start_date=start_date,
                                   end_date=end_date,
                                   region=region)
    return response

if __name__ == "__main__":
    # Fixed reporting window and country for the ad-hoc run.
    region = 'RU'
    start_date, end_date = datetime.date(2015, 8, 1), datetime.date(2015, 9, 27)

    # Swap in get_serp_hits(start_date, end_date, region) to dump raw hits.
    data = get_market_share(start_date, end_date, region)

    # Same output as a print statement: the text followed by a newline.
    sys.stdout.write(data + '\n')
