#!/usr/bin/env python
# encoding: utf-8

import argparse
import json
import re


# Prefix prepended to every key written by TTalentsDataGenerator.generate_saas_data.
SAAS_KEY_PREFIX = "talents_"


class TSnippetValidator(object):
    """Validate snippet records.

    Every check raises ``Exception`` with a human-readable message on the
    first problem it finds; a record that passes all checks returns None.
    """

    def _validate_placeholders(self, text, placeholders):
        """Ensure ``text`` has no '%' outside the allowed ``%name%`` placeholders.

        Args:
            text: string to check.
            placeholders: iterable of allowed placeholder names (without '%').

        Raises:
            Exception: if a '%' character remains once every allowed
                ``%name%`` occurrence has been removed.
        """
        remainder = text
        for name in placeholders:
            # Literal replacement: names are plain text, so they must not be
            # interpreted as regular-expression syntax.
            remainder = remainder.replace('%' + name + '%', '')
        if '%' in remainder:
            raise Exception("Some extra placeholders found in text: '%s'" % text)

    def _validate_link(self, link):
        """Check a URL: only ``%geo_id%``/``%search_text%`` placeholders, and 'from=serp' present.

        Raises:
            Exception: on an unexpected '%' or when 'from=serp' is absent.
        """
        self._validate_placeholders(link, ['geo_id', 'search_text'])
        if 'from=serp' not in link:
            raise Exception("No 'from=serp' in url: %s" % link)

    def validate_snippet_record(self, rec):
        """Validate one snippet record (a dict-like object).

        Required fields must be present and truthy; the link, snippet and
        title are checked for unexpected placeholders; when ``sitelinks`` is a
        list, each item must have a truthy 'title' and 'url', and the url must
        pass the same link check as the record's own link.

        Raises:
            Exception: describing the first failed check.
        """
        for field in ['button', 'id', 'job_category', 'link', 'region_id', 'snippet', 'title']:
            # A falsy value (empty string, 0, None) counts as missing.
            if not rec.get(field):
                raise Exception("Field '%s' is missing or empty" % field)

        self._validate_link(rec['link'])

        text_placeholders = ['preposition', 'prepositional_case']
        self._validate_placeholders(rec['snippet'], text_placeholders)
        self._validate_placeholders(rec['title'], text_placeholders)

        # isinstance (not an exact type comparison) so that list subclasses
        # are validated too instead of being silently skipped.
        sitelinks = rec.get('sitelinks')
        if isinstance(sitelinks, list):
            for sitelink in sitelinks:
                for field in ['title', 'url']:
                    if not sitelink.get(field):
                        raise Exception("Field '%s' is missing or empty in sitelink item" % field)
                self._validate_link(sitelink['url'])


class TTalentsDataGenerator(object):
    """Read tables under a YT directory and write them to local data files."""

    def __init__(self, yt_server, yt_data_path, yt_token_file):
        """Create a YT client and remember the data directory.

        Args:
            yt_server: value passed to ``YtClient`` as ``proxy``.
            yt_data_path: YT directory holding the 'snippets' and
                'job_categories' tables read by the generate_* methods.
            yt_token_file: path to a file whose stripped content is the token.
        """
        # Function-scope import: yt.wrapper is only required once a generator
        # is actually constructed.
        from yt.wrapper import YtClient
        # Context manager so the token file handle is closed right away.
        with open(yt_token_file) as token_file:
            yt_token = token_file.read().strip()
        self.yt_client = YtClient(proxy=yt_server, token=yt_token)
        self.yt_data_path = yt_data_path
        self.validator = TSnippetValidator()

    def generate_saas_data(self, output_file):
        """Validate all snippet records and write them as 'key<TAB>json' lines.

        The key is SAAS_KEY_PREFIX + job_category + '_' + region_id. When
        several records share a key, the one read last is kept. Nothing is
        written unless every record passes validation.

        Raises:
            Exception: if a record fails validation; the message includes the
                offending record serialized as JSON.
        """
        snippets_table = self.yt_data_path + '/snippets'
        snippets = {}
        for rec in self.yt_client.read_table(snippets_table):
            try:
                self.validator.validate_snippet_record(rec)
            except Exception as e:
                raise Exception("%s in record: '%s'" % (e, json.dumps(rec, ensure_ascii=False)))

            key = rec['job_category'] + '_' + str(rec['region_id'])
            snippets[key] = rec

        with open(output_file, 'w') as f:
            # .items() rather than .iteritems(): works on Python 2 and 3.
            for key, data in snippets.items():
                f.write(SAAS_KEY_PREFIX + key + "\t" + json.dumps(data) + "\n")

    def generate_wizard_data(self, output_file):
        """Write the job categories table as a wizard data file.

        The file starts with a fixed header; each record then contributes a
        '=job_category:' line with its tag as compact JSON, an '@RUS' line,
        and one line per lemma.
        """
        job_categories_table = self.yt_data_path + '/job_categories'
        # 'with' guarantees the output file is closed even if reading the
        # table or writing a record raises.
        with open(output_file, 'w') as wizard_out:
            wizard_out.write("YaTalents\n\n")
            wizard_out.write("# This file is auto-generated\n\n")

            for rec in self.yt_client.read_table(job_categories_table):
                category_data = {"tag": rec["tag"]}
                wizard_out.write("=job_category:" + json.dumps(category_data, separators=(',', ':')) + "\n\n")
                wizard_out.write("@RUS\n")
                for lemma in rec["lemmas"]:
                    wizard_out.write(lemma + "\n")
                wizard_out.write("\n\n")


def main():
    """Parse command-line options and run the requested generation action."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--action", dest="action", required=True,
        help="generate-saas-data or generate-wizard-data")
    arg_parser.add_argument(
        "--output", dest="output", default="saas.data",
        help="Output file name for saas or wizard data")
    arg_parser.add_argument("--yt_token_file", required=True, help="Path to file with YT token")
    arg_parser.add_argument("--yt_server", default="hahn", help="YT server short name")
    arg_parser.add_argument(
        "--yt_data_path", required=True,
        help="Path to directory with snippets data on YT server")
    options = arg_parser.parse_args()

    # The generator is built before the action name is checked, so an unknown
    # action is only reported after the YT client has been set up.
    data_generator = TTalentsDataGenerator(
        options.yt_server, options.yt_data_path, options.yt_token_file)

    # Map each supported action name to the method that performs it.
    handlers = {
        "generate-saas-data": data_generator.generate_saas_data,
        "generate-wizard-data": data_generator.generate_wizard_data,
    }
    handler = handlers.get(options.action)
    if handler is None:
        raise Exception('Bad action')
    handler(options.output)


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
