#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import yt.wrapper as yt
import argparse
import urllib2
import json

# Characters that getserp() and getsnippet() delete from titles/headlines
# before splitting them into tokens. The trailing u'\u0007' appends the BEL
# control character to that set.
junk = u'\\\'-_.,—"‘’“”•;:›><()#%@!^&*+=№?[]►▼✔/|`~' + u'\u0007'

def HandleOption():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` exposing one optional ``--server``
        option, stored as ``server`` and defaulting to
        ``hahn.yt.yandex.net:80``.
    """
    server_option = {
        'dest': 'server',
        'help': 'mapreduce server',
        'default': 'hahn.yt.yandex.net:80',
        'required': False,
    }
    cli = argparse.ArgumentParser()
    cli.add_argument('--server', **server_option)
    return cli

def lcs(a, b):
    """Return one longest common subsequence of the strings *a* and *b*.

    A dynamic-programming table is filled so that ``table[i][j]`` holds the
    LCS length of ``a[:i]`` and ``b[:j]``; the subsequence itself is then
    recovered by walking the table back from the bottom-right corner.
    """
    width = len(b) + 1
    # Row 0 and column 0 stay at zero: an empty prefix shares nothing.
    table = [[0] * width for _ in range(len(a) + 1)]
    for i, left in enumerate(a, 1):
        for j, right in enumerate(b, 1):
            if left == right:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i][j - 1], table[i - 1][j])

    # Walk back through the table, collecting matched items last-to-first.
    picked = []
    i, j = len(a), len(b)
    while i and j:
        here = table[i][j]
        if here == table[i - 1][j]:
            # Dropping a[i-1] loses nothing; this move is tried first on ties.
            i -= 1
            continue
        if here == table[i][j - 1]:
            j -= 1
            continue
        assert a[i - 1] == b[j - 1]
        picked.append(a[i - 1])
        i -= 1
        j -= 1
    picked.reverse()
    return "".join(picked)

def getserp(query):
    """Collect the distinct words found in the result titles for *query*.

    The search results are requested as JSON; every document's ``doctitle``
    is lower-cased, stripped of the characters listed in ``junk`` and split
    on single spaces.

    Args:
        query: Query text passed to ``urllib2.quote`` (``main`` passes UTF-8
            encoded bytes).

    Returns:
        A list of unique tokens in first-seen order. An empty list is
        returned, and a one-line message written to stderr, when the request
        fails or the response body is not valid JSON.

    Raises:
        KeyError: If the parsed response has no ``searchdata.docs`` entry or
            a document has no ``doctitle``; this case is not handled here.
    """
    adapt = urllib2.quote(query)
    path = 'http://priemka.hamster.yandex.ru/yandsearch?&text=' + adapt + '&json_dump=searchdata.docs&waitall=da&nocache=da&i-am-a-hacker=1&no-tests=1&timeout=9999999999'
    try:
        u = urllib2.urlopen(path)
        try:
            data = u.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            u.close()
    except Exception:
        # Best-effort fetch: any network/decoding failure yields "no tokens".
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be interrupted.
        sys.stderr.write('connection problem\n')
        return []
    try:
        result = json.loads(data)
    except ValueError:
        sys.stderr.write('no json!\n')
        return []

    serp = []
    seen = set()  # O(1) membership test; `serp` keeps first-seen order
    for r in result['searchdata.docs']:
        doctitle = r['doctitle'].lower()
        for j in junk:
            doctitle = doctitle.replace(j, '')
        # NOTE: split(' ') keeps the empty strings produced by consecutive
        # spaces; left as-is so the returned token list is unchanged.
        for t in doctitle.split(' '):
            if t not in seen:
                seen.add(t)
                serp.append(t)
    return serp


def getsnippet(query):
    """Collect the distinct words found in the result headlines for *query*.

    The search results are requested as JSON; every document's ``headline``
    is lower-cased, stripped of the characters listed in ``junk`` and split
    on single spaces.

    Args:
        query: Query text passed to ``urllib2.quote`` (``main`` passes UTF-8
            encoded bytes).

    Returns:
        A list of unique tokens in first-seen order. An empty list is
        returned, and a one-line message written to stderr, when the request
        fails or the response body is not valid JSON.

    Raises:
        KeyError: If the parsed response has no ``searchdata.docs`` entry or
            a document has no ``headline``; this case is not handled here.
    """
    adapt = urllib2.quote(query)
    path = 'http://priemka.hamster.yandex.ru/yandsearch?&text=' + adapt + '&json_dump=searchdata.docs&waitall=da&nocache=da&i-am-a-hacker=1&no-tests=1&timeout=9999999999'
    try:
        u = urllib2.urlopen(path)
        try:
            data = u.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            u.close()
    except Exception:
        # Best-effort fetch: any network/decoding failure yields "no tokens".
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be interrupted.
        sys.stderr.write('connection problem\n')
        return []
    try:
        result = json.loads(data)
    except ValueError:
        sys.stderr.write('no json!\n')
        return []

    tokens = []
    seen = set()  # O(1) membership test; `tokens` keeps first-seen order
    for r in result['searchdata.docs']:
        headline = r['headline'].lower()
        for j in junk:
            headline = headline.replace(j, '')
        # NOTE: split(' ') keeps the empty strings produced by consecutive
        # spaces; left as-is so the returned token list is unchanged.
        for t in headline.split(' '):
            if t not in seen:
                seen.add(t)
                tokens.append(t)
    return tokens


def goplease(query):
    """Fetch and parse the JSON response of the ``speech`` endpoint for *query*.

    Args:
        query: Text passed to ``urllib2.quote`` and sent as ``user_input``.

    Returns:
        The decoded JSON document. On any failure (request error or invalid
        JSON) an empty dict is returned and a one-line message is written to
        stderr, so callers always get the same "empty" type back.
    """
    path = 'https://yastroka.yandex.net/speech?user_input=' + urllib2.quote(query)
    try:
        u = urllib2.urlopen(path)
        try:
            data = u.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            u.close()
    except Exception:
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be interrupted.
        sys.stderr.write('connection problem\n')
        # Same empty value as the JSON-failure path below (this path used to
        # return a list, the other a dict).
        return {}
    try:
        return json.loads(data)
    except ValueError:
        sys.stderr.write('no json!\n')
        return {}


def is_number(s):
    """Return True if ``float(s)`` succeeds, False otherwise.

    Args:
        s: Any value; typically a string.

    Returns:
        bool. Values ``float`` rejects by type (e.g. ``None`` or a list)
        yield False instead of raising TypeError.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True


def _jaccard(first, second):
    """Return |first & second| / |first | second|, or -1 if both are empty."""
    first, second = set(first), set(second)
    union = first | second
    if not union:
        return -1
    # True division: the module enables `from __future__ import division`.
    return len(first & second) / len(union)


def main():
    """Score the first candidate from goplease() for every TEST row.

    Reads rows from the table ``//home/voice/toloka/ru-RU/mobile`` through
    the proxy given by ``--server``. For each row whose ``mark`` is ``TEST``
    the row's ``text`` is sent to ``goplease`` and the first returned
    candidate is compared with it. One tab-separated line is written to
    stdout per scored row:

        exact-match flag, original text, candidate text, candidate source,
        m1, m2, m3

    where
        m1 = len(LCS) / len(longer text), character based;
        m2 = Jaccard index of the getserp() tokens of both texts;
        m3 = Jaccard index of the getsnippet() tokens of both texts.

    A metric that is undefined (both sides empty) is reported as -1. Rows
    for which ``goplease`` yields no candidates are skipped with a message
    on stderr.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    table_path = '//home/voice/toloka/ru-RU/mobile'
    for row in yt.read_table(table_path):
        if row['mark'] != 'TEST':
            continue

        query = row['text']
        handle = goplease(query)
        # goplease() returns an empty container when the request fails, and
        # a successful response may still carry no candidates.
        candidates = handle.get('candidates') if isinstance(handle, dict) else None
        if not candidates:
            sys.stderr.write('no candidates\n')
            continue
        c = candidates[0]

        original = query.decode('utf-8')
        recognised = c['text']
        recognised_utf8 = recognised.encode('utf-8')

        # Guard against two empty texts, which would divide by zero.
        longest = max(len(recognised), len(original))
        m1 = len(lcs(recognised, original)) / longest if longest else -1

        serp_orig = getserp(query)
        sni_orig = getsnippet(query)
        serp_rec = getserp(recognised_utf8)
        sni_rec = getsnippet(recognised_utf8)
        m2 = _jaccard(serp_orig, serp_rec)
        m3 = _jaccard(sni_orig, sni_rec)

        fields = [
            str(original == recognised),
            original.encode('utf-8'),
            recognised_utf8,
            c['source'].encode('utf-8'),
            str(m1),
            str(m2),
            str(m3),
        ]
        sys.stdout.write('\t'.join(fields) + '\n')

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
