#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import argparse
import json
import sys
import datetime
import time

def HandleOptions():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts ``--serps``, ``--outq``,
        ``--ts`` and ``--dev`` (stored as ``device``, default ``"touch"``).
        The caller is responsible for invoking ``parse_args()``.
    """
    # (flag, keyword arguments for add_argument) in the order they are registered.
    option_specs = (
        ("--serps", {"dest": "serps", "help": "File with Yandex results"}),
        ("--outq", {"dest": "outq", "help": "Output queries"}),
        ("--ts", {"dest": "ts", "help": "timestamp"}),
        ("--dev", {"dest": "device", "help": "Mode: touch|desktop", "default": "touch"}),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

def clean(url):
    """Normalize a URL into a lowercase, scheme-less form for host matching.

    The URL is lowercased first, so an upper-case scheme or ``WWW.`` is
    stripped as well.  Only a *leading* ``https://`` / ``http://`` and a
    *leading* ``www.`` are removed; occurrences further inside the string
    (e.g. ``awww.ru`` or a URL embedded in a query string) are left alone.
    A single trailing ``/`` is dropped.

    Any URL that contains ``.sdamgia.ru`` collapses to the bare
    ``sdamgia.ru`` so that all of its subdomains compare equal.

    Args:
        url: URL string (``str`` or ``unicode``).

    Returns:
        The normalized string; ``''`` when nothing is left after stripping.
    """
    url = url.lower()

    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]

    # endswith() is safe on an empty string, so no separate emptiness check.
    if url.endswith('/'):
        url = url[:-1]

    if u".sdamgia.ru" in url:
        return u"sdamgia.ru"
    return url


def statface_send(value, mode = "recall", path = "rnd"):
    """Upload one data row to the Statface report ``Tutor/<mode>/<path>``.

    The row is posted with daily scale (``"scale": "d"``).  Up to 10 attempts
    are made, pausing 10 seconds between them; connection errors, timeouts
    and HTTP error statuses all trigger a retry.  Failures are reported on
    stderr and never raised, so the upload stays best-effort.

    Args:
        value: JSON-serializable dict holding a single report row.
        mode: Second component of the report name.
        path: Third component of the report name.

    Returns:
        None.
    """
    # The module never imports ``requests`` at file level, so without this
    # import the first call fails with NameError.
    import requests

    attempts = 10
    retry_delay = 10  # seconds between attempts

    for i in range(attempts):
        try:
            # SECURITY: robot credentials are hard-coded in the source; they
            # should be moved to the environment or a secret store.
            r = requests.post("https://upload.stat.yandex-team.ru/_api/report/data",
                              headers = {"StatRobotUser": "robot_itajn", "StatRobotPassword": "e0nes7uraS5eSti"},
                              data = {"name": "Tutor/" + mode + "/" + path,
                                      "scale": "d",
                                      "json_data": json.dumps({"values": [value]}),
                                     },
                              timeout = 10
                             )
            # requests.post() does not raise on 4xx/5xx by itself; without
            # this call the HTTPError handler below could never fire.
            r.raise_for_status()
            return
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            sys.stderr.write("Can\"t connect to statface, try # %s\n" % i)
            sys.stderr.write("%s\n" % e)
            # No point sleeping after the final attempt.
            if i + 1 < attempts:
                time.sleep(retry_delay)

    sys.stderr.write("Giving up on statface upload after %d attempts\n" % attempts)

def main():
    """Compute the Tutor "recall" over a SERP dump, report it and dump per-query stats.

    Steps:
      1. Read ``--serps`` line by line; every line except the ``[``, ``]``,
         ``,`` separators (and blank lines) is parsed as one JSON SERP.
      2. For each SERP, collect the result URLs that belong to a known
         competitor host and note whether a ``yandex.ru/tutor`` result is
         present (its position among LEFT-aligned results, or the fact that
         it is RIGHT-aligned).
      3. Send ``shows / competitive`` to Statface via ``statface_send``, where
         ``competitive`` is the number of recorded queries with a competitor
         and ``shows`` is the number of recorded queries with a tutor result.
      4. Write one tab-separated line per recorded query to ``--outq``.

    Relies on the Python 2 ``unicode`` builtin.
    """
    parser = HandleOptions()
    args = parser.parse_args()

    # datetime.fromtimestamp() needs a number, not a string.  Only the first
    # 10 characters are used -- presumably to cut a millisecond timestamp
    # down to seconds; confirm against the caller.
    timestamp = int(args.ts[:10])

    # A tuple so it can be handed directly to startswith().
    competitors = (
        "sdamgia.ru", "4ege.ru", "onlyege.ru", "self-edu.ru", "ucheba.ru",
        "vpr-ege.ru", "ctege.info", "tolkoexamen.ru", "neznaika.info",
        "100balnik.com", "vprklass.ru", "reshimvse.com", "examer.ru",
        "ege59.ru", "examen.ru", "bingoschool.ru", "ugdz.ru", "euroki.app",
        "ege.edu.ru", "vopvet.ru", "5-ege.ru", "rustutors.ru", "rustest.ru",
        "poznaemvmeste.ru", "cknow.ru", "academyege.ru", "ege-study.ru",
        "russkiykim.ru", "compendium.su", "abiturient24.com", "metodarhiv.ru",
        "worksbase.ru", "maximumtest.ru", "math100.ru", "bugaga.net.ru",
        "egemaximum.ru", "ege-essay.ru", "owlearn.ru", "russkiy-na-5.ru",
        "5litra.ru", "egeprof.ru", "alexlarin.com", "smartrepetitor.ru",
    )

    processed = 0
    querystat = {}
    with open(args.serps, "r") as serps:
        for raw_line in serps:
            line = raw_line.strip()
            # Skip the JSON-array scaffolding and blank lines; everything
            # else is expected to be one complete JSON object.
            if line in ("", ",", "[", "]"):
                continue
            if not processed % 100:
                # NOTE(review): the percentage assumes 10000 input SERPs.
                sys.stderr.write("progress: %s\n" % (100.0 * processed / 10000))
            processed += 1

            s = json.loads(line)
            query = s["serp-request-explained"]["per-query-parameters"]["query-text"].encode("utf-8")

            parser_results = (s.get("serp-page") or {}).get("parser-results")
            if not parser_results or "components" not in parser_results:
                sys.stderr.write("No serp found! %s\n" % json.dumps(s))
                continue

            com = False
            urllist = []
            tutor_url = None
            tutor_pos = None
            tutor_right = False
            pos = 0  # number of LEFT-aligned results seen so far
            for res in parser_results["components"]:
                url = unicode(res.get("page-url"))
                # clean() collapses every *.sdamgia.ru URL to "sdamgia.ru",
                # which is itself in ``competitors``; a separate check for it
                # would append the same URL twice.
                if clean(url).startswith(competitors):
                    com = True
                    urllist.append(url)

                # .get(): a component without "alignment" should not abort
                # the whole run.
                alignment = res.get("alignment")
                if u"yandex.ru/tutor" in url:
                    tutor_url = url if not tutor_url else tutor_url + "," + url
                    if alignment == "RIGHT":
                        tutor_right = True
                    else:
                        tutor_pos = pos
                if alignment == "LEFT":
                    pos += 1

            if com or tutor_pos is not None or tutor_right:
                querystat[query] = {"has_competition" : com, "urllist" : urllist,
                                    "tutor_url" : tutor_url, "tutor_pos" : tutor_pos, "tutor_right" : tutor_right}

    competitive = 0
    shows = 0
    for stat in querystat.values():
        if stat["has_competition"]:
            competitive += 1
        if stat["tutor_pos"] is not None or stat["tutor_right"]:
            shows += 1
    # NOTE(review): ``shows`` also counts queries without a competitor, so
    # the ratio can exceed 1 -- confirm this is the intended metric.
    statface_send({"fielddate" : datetime.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d"), "device" : args.device,
                "recall" : shows/competitive if competitive else 0
                })

    with open(args.outq, "w") as outq:
        for q, stat in querystat.items():
            outq.write("\t".join([q, str(stat["has_competition"]), str(stat["urllist"]), str(stat["tutor_url"]), str(stat["tutor_pos"]), str(stat["tutor_right"])]) + "\n")

# Run the report only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
