# -*- coding: utf-8 -*-

from argparse import ArgumentParser
import logging
import sys
from collections import defaultdict
import gzip
import os
import shutil
from xml.etree import ElementTree
import datetime

from yt.wrapper import YtClient

import boto3

from travel.library.python.tools import replace_args_from_env
from travel.hotels.lib.python3.yql import yqllib
from travel.hotels.lib.python3.yt import ytlib
from travel.hotels.tools.sitemap_builder import parameters


class Runner(object):
    """Build gzipped XML sitemaps for hotel pages and publish them to S3.

    The pipeline (see ``do_work``) runs a YQL query that writes the hotel list
    into a YT table, renders hotel, static-page, region and root sitemap files
    into a local directory, uploads them to S3 and finally removes every other
    object found under the configured S3 prefix.
    """

    def __init__(self, args):
        """Create the YQL and YT clients from parsed command-line arguments.

        :param args: argparse namespace; the attributes read here are
            ``yt_proxy``, ``yt_token``, ``yt_token_path``, ``yql_token`` and
            ``yql_token_path``. The namespace is kept as ``self.args`` for the
            other methods.
        """
        self.yql_client = yqllib.create_client(db=args.yt_proxy, token=args.yql_token, token_path=args.yql_token_path)

        yt_config = {
            'token': args.yt_token,
            'token_path': args.yt_token_path,
        }
        self.yt_client = YtClient(proxy=args.yt_proxy, config=yt_config)
        self.args = args
        # Captured once so that every <lastmod> written during this run carries the same date.
        self.today_str = datetime.date.today().isoformat()

    def run(self):
        """Run the whole pipeline using the YT working directory from the arguments."""
        self.do_work(self.args.yt_sitemap_path)

    def do_work(self, work_path):
        """Build the hotels table, generate sitemap files locally and sync them to S3.

        :param work_path: YT directory that will hold the intermediate ``hotels`` table.
        """
        self._ensure_yt_dir(work_path)
        hotels_table = ytlib.join(work_path, 'hotels')
        sitemaps_local_root = 'sitemaps'
        self._recreate_local_dir(sitemaps_local_root)

        self.get_hotels(hotels_table)
        sitemap_files = self.generate_sitemap_files(hotels_table, sitemaps_local_root)

        self.sync_with_s3(sitemaps_local_root, sitemap_files)

    def _ensure_yt_dir(self, yt_path):
        """Create the YT map node ``yt_path`` (with parents) unless it already exists."""
        self.yt_client.create('map_node', yt_path, recursive=True, ignore_existing=True)

    def _recreate_local_dir(self, path):
        """Replace the local directory ``path`` with a fresh, empty one."""
        if os.path.exists(path):
            shutil.rmtree(path)
        os.mkdir(path)

    def get_hotels(self, hotels_table):
        """Run ``get_hotels.yql`` so that it writes the hotel list to ``hotels_table``.

        :param hotels_table: YT path passed to the query as ``$output_path``.
        """
        logging.info("Determine list of hotels")
        yqllib.run_yql_file(
            self.yql_client,
            'get_hotels.yql', 'SitemapBuilder',
            parameters={
                '$rubric_permalinks': parameters.HOTEL_RUBRIC_PERMALINKS,
                '$feature_group_ids': parameters.FEATURE_GROUP_IDS,
                '$hotel_limit': parameters.HOTEL_LIMIT,
                '$hotel_slugs_table': self.args.yt_hotel_slugs_export_path,
                '$output_path': hotels_table,
            },
        )

    def generate_sitemap_files(self, hotels_table, sitemaps_local_root):
        """Generate every sitemap file and return their names.

        :param hotels_table: YT table with the hotel rows.
        :param sitemaps_local_root: local directory the files are written to.
        :return: list of generated file names (relative to
            ``sitemaps_local_root``); the root index ``sitemap.xml.gz`` is last.
        """
        # Hotel sitemaps, grouped by country and split into chunks.
        sitemap_files = list(self.generate_hotels_sitemap_files(hotels_table, sitemaps_local_root))

        # Different pages sitemap
        filename = 'sitemap.pages.xml.gz'
        sitemap_files.append(filename)
        urls = [f'{self.args.portal_url}/hotels/']
        self.generate_sitemap_file(sitemaps_local_root, filename, urls, is_root_file=False)

        # Region pages sitemap; skipped entirely when there are no region URLs.
        urls = self.generate_regions_urls()
        if urls:
            filename = 'sitemap.regions.xml.gz'
            sitemap_files.append(filename)
            self.generate_sitemap_file(sitemaps_local_root, filename, urls, is_root_file=False)

        # Root sitemap: an index that points at every file generated above.
        root_filename = 'sitemap.xml.gz'
        root_urls = [f'{self.args.portal_url}/sitemaps/hotels/{f}' for f in sitemap_files]
        self.generate_sitemap_file(sitemaps_local_root, root_filename, root_urls, is_root_file=True)
        return sitemap_files + [root_filename]

    def generate_hotels_sitemap_files(self, hotels_table, sitemaps_local_root):
        """Write hotel sitemaps grouped by country and return their file names.

        Rows whose ``country_iso_name`` is in ``parameters.SEPARATE_COUNTRIES``
        are grouped under the lower-cased country name; all other rows go to
        the ``other`` group. Each group is split into files of at most
        ``parameters.SITEMAP_FILE_RECORD_LIMIT`` URLs; the first file of a group
        is ``sitemap.<key>.xml.gz`` and the following ones get a ``_<n>`` suffix
        starting from 2.

        :param hotels_table: YT table to read ``country_iso_name`` and ``slug`` from.
        :param sitemaps_local_root: local directory the files are written to.
        :return: list of generated file names.
        """
        sitemap_files = []
        logging.info("Reading list of hotels from YT")
        hotel_slugs_by_key = defaultdict(list)
        for row in self.yt_client.read_table(hotels_table):
            country = row['country_iso_name']
            key = country.lower() if country in parameters.SEPARATE_COUNTRIES else 'other'
            hotel_slugs_by_key[key].append(row['slug'])

        limit = parameters.SITEMAP_FILE_RECORD_LIMIT
        for key, slugs in hotel_slugs_by_key.items():
            for idx, offset in enumerate(range(0, len(slugs), limit), start=1):
                suffix = key if idx == 1 else f'{key}_{idx}'
                filename = f'sitemap.{suffix}.xml.gz'
                sitemap_files.append(filename)
                logging.info(f"Generating sitemap file {filename}")
                urls = (self.get_hotel_page_url(slug) for slug in slugs[offset:offset + limit])
                self.generate_sitemap_file(sitemaps_local_root, filename, urls, is_root_file=False)
        logging.info("Reading list of hotels from YT - finished")
        return sitemap_files

    def generate_regions_urls(self):
        """Return region page URLs read from the optional region pages table.

        :return: list of URLs; empty when ``--yt-region-pages-path`` is not set.
        """
        urls = []
        table_path = self.args.yt_region_pages_path
        if table_path:
            logging.info(f"Reading list of regions from YT table {self.args.yt_region_pages_path}")
            # Only the two columns needed to build a URL are requested.
            table = self.yt_client.TablePath(table_path, columns=['slug', 'filter_slug'])
            for row in self.yt_client.read_table(table):
                urls.append(self.get_region_page_url(row['slug'], row['filter_slug']))
            logging.info("Reading list of regions from YT - Finished")
        return urls

    def get_hotel_page_url(self, slug):
        """Return the portal URL of the hotel page for ``slug``."""
        return f'{self.args.portal_url}/hotels/{slug}/'

    def get_region_page_url(self, slug, filter_slug):
        """Return the portal URL of a region page, with an optional filter segment.

        :param slug: region slug.
        :param filter_slug: filter slug; when falsy no ``filter-...`` segment is added.
        """
        res = f'{self.args.portal_url}/hotels/{slug}/'
        if filter_slug:
            res += f'filter-{filter_slug}/'
        return res

    def generate_sitemap_file(self, sitemaps_local_root, filename, urls, is_root_file):
        """Write ``urls`` as a gzipped sitemap XML file.

        :param sitemaps_local_root: local directory to write into.
        :param filename: name of the file inside that directory.
        :param urls: iterable of URL strings; it is consumed once.
        :param is_root_file: when true a ``sitemapindex`` is written, otherwise a ``urlset``.
        """
        # https://www.sitemaps.org/protocol.html
        root = ElementTree.Element(
            'sitemapindex' if is_root_file else 'urlset',
            attrib={
                'xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9'
            }
        )
        root.extend(self.url2xml(url, is_root_file) for url in urls)
        tree = ElementTree.ElementTree(root)

        full_filename = os.path.join(sitemaps_local_root, filename)
        with open(full_filename, 'wb') as fgz, gzip.GzipFile(fileobj=fgz, mode="w") as f:
            tree.write(f, encoding='utf-8', xml_declaration=True)

    def url2xml(self, url, is_root_file):
        """Build the XML element describing a single URL.

        :param url: text of the ``<loc>`` child.
        :param is_root_file: when true a ``<sitemap>`` element with only
            ``<loc>`` is returned; otherwise a ``<url>`` element that also
            carries ``<changefreq>``, ``<lastmod>`` and ``<priority>`` when the
            corresponding command-line options are set.
        :return: the ``xml.etree.ElementTree.Element``.
        """
        url_tag = ElementTree.Element('sitemap' if is_root_file else 'url')
        ElementTree.SubElement(url_tag, 'loc').text = url
        if is_root_file:
            return url_tag

        if self.args.changefreq:
            ElementTree.SubElement(url_tag, 'changefreq').text = self.args.changefreq
        if self.args.add_lastmod:
            ElementTree.SubElement(url_tag, 'lastmod').text = self.today_str
        if self.args.priority:
            ElementTree.SubElement(url_tag, 'priority').text = self.args.priority
        return url_tag

    def _make_s3_client(self):
        """Create the boto3 S3 client from the S3-related command-line arguments."""
        session = boto3.session.Session(
            aws_access_key_id=self.args.s3_access_key,
            aws_secret_access_key=self.args.s3_access_secret_key,
        )
        # NOTE(review): verify=False turns off TLS certificate verification for the
        # S3 endpoint. Kept as in the original; confirm it is intentional and
        # consider passing a CA bundle instead.
        return session.client(service_name='s3', endpoint_url=self.args.s3_endpoint, verify=False)

    def sync_with_s3(self, sitemaps_local_root, file_list):
        """Upload the generated files and delete every other object under the prefix.

        :param sitemaps_local_root: local directory holding the files.
        :param file_list: names of the files (relative to
            ``sitemaps_local_root``) to upload; the S3 key of each file is the
            configured prefix, terminated with ``/``, followed by the name.
        """
        s3_pfx = self.args.s3_prefix
        if not s3_pfx.endswith('/'):
            s3_pfx += '/'
        bucket = self.args.s3_bucket
        s3 = self._make_s3_client()

        allowed_keys = set()
        for fn in file_list:
            local_fn = os.path.join(sitemaps_local_root, fn)
            key = s3_pfx + fn
            logging.info(f'Uploading {local_fn} -> {key}')
            # The context manager closes the file handle once the upload returns.
            with open(local_fn, 'rb') as body:
                s3.put_object(Bucket=bucket, Key=key, Body=body)
            allowed_keys.add(key)

        # A single list_objects response holds only one page of keys, and
        # 'Contents' is absent from a page without objects, so iterate all pages
        # and tolerate empty ones. Stale keys are collected before any deletion
        # so that the listing is not modified while it is being paged through.
        stale_keys = []
        paginator = s3.get_paginator('list_objects')
        for page in paginator.paginate(Bucket=bucket, Prefix=s3_pfx):
            for obj in page.get('Contents', []):
                if obj['Key'] not in allowed_keys:
                    stale_keys.append(obj['Key'])

        for key in stale_keys:
            logging.info(f"Remove '{key}'")
            s3.delete_object(Bucket=bucket, Key=key)


def main():
    """Configure logging, parse the command-line options and run the builder.

    The argument list handed to argparse comes from ``replace_args_from_env()``
    (presumably this lets options be supplied through environment variables -
    confirm against that helper).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)-15s | %(module)s | %(levelname)s | %(message)s",
        stream=sys.stdout,
    )
    # Only warnings and above are let through from the YT client's HTTP connection pool logger.
    logging.getLogger('yt.packages.urllib3.connectionpool').setLevel(logging.WARNING)

    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        ('--yt-proxy', {'default': 'hahn'}),
        ('--yt-token', {}),
        ('--yt-token-path', {}),
        ('--yql-token', {}),
        ('--yql-token-path', {}),
        ('--yt-sitemap-path', {'required': True}),
        ('--yt-hotel-slugs-export-path', {'default': '//home/travel/prod/general/slugs/latest/hotel_slugs_export'}),
        ('--yt-region-pages-path', {}),
        ('--portal-url', {'default': 'https://travel.yandex.ru'}),
        ('--s3-endpoint', {'default': 'https://s3.mds.yandex.net'}),
        ('--s3-bucket', {'default': 'travel-indexer'}),
        ('--s3-prefix', {'default': 'sitemaps'}),
        ('--s3-access-key', {'required': True}),
        ('--s3-access-secret-key', {'required': True}),
        ('--changefreq', {}),
        ('--priority', {}),
        ('--add-lastmod', {'action': 'store_true'}),
    )
    arg_parser = ArgumentParser()
    for flag, options in option_specs:
        arg_parser.add_argument(flag, **options)

    parsed_args = arg_parser.parse_args(args=replace_args_from_env())
    builder = Runner(parsed_args)
    builder.run()


# Run the builder only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
