#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import urllib2
import json

def HandleOption():
    """Build the command-line parser for this script.

    The parser accepts a single mandatory option, ``--file``, whose value is
    stored on the parsed namespace as ``file``.

    Returns:
        The configured ``argparse.ArgumentParser`` (arguments are not parsed
        here; the caller invokes ``parse_args``).
    """
    option_parser = argparse.ArgumentParser()
    option_parser.add_argument(
        "--file",
        dest="file",
        required=True,
        help="input with tab-separated strings"
    )
    return option_parser

def lcs(a, b):
    """Return one longest common subsequence of ``a`` and ``b``.

    Uses the classic dynamic-programming table: ``table[i][j]`` holds the
    length of the longest common subsequence of ``a[:i]`` and ``b[:j]``.
    The subsequence itself is then recovered by walking the table back from
    the bottom-right corner.

    Args:
        a: first sequence (indexable, with ``len``), e.g. a string.
        b: second sequence of the same kind.

    Returns:
        The common subsequence as a string; ``""`` when nothing is shared.
    """
    rows, cols = len(a), len(b)
    # Row 0 and column 0 stay at zero: an empty prefix shares nothing.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i, item_a in enumerate(a, 1):
        current, above = table[i], table[i - 1]
        for j, item_b in enumerate(b, 1):
            if item_a == item_b:
                current[j] = above[j - 1] + 1
            else:
                current[j] = max(current[j - 1], above[j])

    # Backtrack: prefer moving up, then left; otherwise the current pair of
    # elements belongs to the subsequence. Elements are gathered last-first.
    collected = []
    i, j = rows, cols
    while i and j:
        here = table[i][j]
        if here == table[i - 1][j]:
            i -= 1
        elif here == table[i][j - 1]:
            j -= 1
        else:
            assert a[i - 1] == b[j - 1]
            collected.append(a[i - 1])
            i -= 1
            j -= 1
    collected.reverse()
    return "".join(collected)

def getserp(query):
    """Fetch the search results for ``query`` and return the document URLs.

    The query is percent-encoded with ``urllib2.quote`` and requested from
    the hamster.yandex.ru search endpoint with ``json_dump=searchdata.docs``.
    The ``url`` field of every entry under the ``searchdata.docs`` key of the
    JSON reply is collected.

    Args:
        query: search text, passed to ``urllib2.quote`` unchanged.

    Returns:
        A list of URLs in the order they appear in the reply, or an empty
        list when the reply body is not valid JSON (``no json!`` is written
        to stderr in that case).

    Raises:
        Errors from ``urllib2.urlopen`` / reading the response propagate, as
        does ``UnicodeDecodeError`` for a non-UTF-8 body. A reply that parses
        as JSON but lacks ``searchdata.docs`` or a ``url`` field raises
        ``KeyError`` (or ``TypeError`` if the JSON is not shaped as expected).
    """
    adapt = urllib2.quote(query)
    path = 'https://hamster.yandex.ru/yandsearch?&text=' + adapt + '&json_dump=searchdata.docs&waitall=da'
    response = urllib2.urlopen(path)
    try:
        data = response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
    try:
        result = json.loads(data)
    except ValueError:
        # json.loads reports malformed input as ValueError; catching only
        # that keeps KeyboardInterrupt/SystemExit and real bugs visible.
        sys.stderr.write('no json!\n')
        return []
    return [doc['url'] for doc in result['searchdata.docs']]


def main():
    """Score every line of the input file and print two metrics per line.

    Each input line is expected to hold two tab-separated UTF-8 strings:
    the recognised text and the original text. For every such pair one
    output line ``m1 m2`` is printed, where

    * ``m1`` is the length of the longest common subsequence of the two
      decoded strings divided by the length of the longer string;
    * ``m2`` is the size of the intersection of the two result-URL sets
      returned by ``getserp`` divided by the size of their union, or ``-1``
      when both result lists are empty.

    Lines that do not contain two fields after stripping are reported on
    stderr and skipped instead of aborting the whole run.
    """
    args = HandleOption().parse_args()
    with open(args.file, 'r') as source:
        for line_no, line in enumerate(source, 1):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # strip() also removes a trailing tab, so blank lines and
                # lines with an empty second column both end up here.
                sys.stderr.write(
                    'line %d: expected two tab-separated fields, skipping\n' % line_no)
                continue
            raw_recognised, raw_original = fields[0], fields[1]
            recognised = raw_recognised.decode('utf-8')
            original = raw_original.decode('utf-8')
            # True division comes from the module-level __future__ import.
            m1 = len(lcs(recognised, original)) / max(len(recognised), len(original))
            # The search is queried with the raw (undecoded) field values.
            urls_rec = set(getserp(raw_recognised))
            urls_orig = set(getserp(raw_original))
            all_urls = urls_rec | urls_orig
            if all_urls:
                m2 = len(urls_rec & urls_orig) / len(all_urls)
            else:
                # Neither query returned anything: overlap is undefined.
                m2 = -1
            print("%s %s" % (m1, m2))
# Run the scoring loop only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
