#!/usr/bin/env python
from itertools import chain

import logging
import time
import psycopg2
import psycopg2.extras
import nirvana.job_context as nv
import uuid
from psycopg2.extensions import string_types

# Maps the name of a psycopg2 typecaster (looked up through
# ``string_types[column.type_code].name`` in ``yt_type``) to the YT column
# type written into the table schema. Names missing from this table fall back
# to 'any' in ``yt_type``.
PG_to_YT_type_mapper = {
    # Temporal values are stored as strings.
    'DATE': 'string',
    'TIME': 'string',
    'DATETIME': 'string',
    'DATETIMETZ': 'string',
    'LONGINTEGER': 'int64',
    'INTEGER': 'int32',
    'STRING': 'utf8',
    'BINARY': 'string',
    'FLOAT': 'double',
    'BOOLEAN': 'boolean',
    # NOTE(review): DECIMAL is declared as 'double' — confirm that the loss of
    # exact decimal precision is acceptable for the exported data.
    'DECIMAL': 'double',
}


def retry(n_tries=3, exception_cls=Exception, sleep=0):
    """
    Build a decorator that calls a function again when it raises
    :param n_tries: number of retries made after the first failed attempt,
        so the function runs at most ``n_tries + 1`` times
    :param exception_cls: exception class (or tuple of classes) that triggers
        a retry; any other exception propagates immediately
    :param sleep: seconds to wait before each retry
    :return: decorator that keeps the wrapped function's name and docstring
    """
    # Imported locally so the decorator does not depend on the file's
    # top-level import block.
    from functools import wraps

    def inner(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            failures = 0
            while True:
                try:
                    return f(*args, **kwargs)
                except exception_cls:
                    # Retry budget exhausted: re-raise the last error with
                    # its original traceback.
                    if not failures < n_tries:
                        raise
                    failures += 1
                    time.sleep(sleep)
        return wrapped
    return inner


def datetime_to_iso(dt):
    """
    Convert a date/time/datetime value to its ISO 8601 string
    :param dt: object exposing ``isoformat()``, or None for SQL NULL
    :return: ``dt.isoformat()``, or None when ``dt`` is None
    """
    # Compare against None explicitly instead of relying on truthiness: on
    # interpreters older than Python 3.5 a midnight ``datetime.time`` is
    # falsy and would be exported as NULL.
    if dt is None:
        return None
    return dt.isoformat()


# Per-type value converters, keyed by psycopg2 typecaster name (the same keys
# as in ``PG_to_YT_type_mapper``). ``get_converter`` returns an identity
# function for any type name not listed here.
converters = {
    'DATE': datetime_to_iso,
    'TIME': datetime_to_iso,
    'DATETIME': datetime_to_iso,
    'DATETIMETZ': datetime_to_iso,
}


def yt_type(column):
    """
    Return the YT column type for one entry of ``cursor.description``
    :param column: description entry exposing ``type_code``
    :return: YT type name from ``PG_to_YT_type_mapper``, or 'any' when the
        type is unknown
    """
    # ``string_types`` only holds registered typecasters; an OID without one
    # must fall back to 'any' instead of raising KeyError.
    caster = string_types.get(column.type_code)
    if caster is None:
        return 'any'
    return PG_to_YT_type_mapper.get(caster.name, 'any')


def get_schema(cursor):
    """
    Build the YT table schema from a cursor's result description
    :param cursor: cursor whose ``description`` is already populated
    :return: list of ``{'name': ..., 'type': ...}`` dicts, one per column
    """
    schema = []
    for col in cursor.description:
        schema.append({'name': col.name, 'type': yt_type(col)})
    return schema


def get_converter(column):
    """
    Return the value converter for one entry of ``cursor.description``
    :param column: description entry exposing ``type_code``
    :return: one-argument callable from ``converters``, or an identity
        function when the type needs no conversion or is unknown
    """
    def passthrough(value):
        return value

    # ``string_types`` only holds registered typecasters; values of an OID
    # without one are passed through untouched instead of raising KeyError.
    caster = string_types.get(column.type_code)
    if caster is None:
        return passthrough
    return converters.get(caster.name, passthrough)


def get_converters(cursor):
    """
    Collect the value converter for every column of a cursor
    :param cursor: cursor whose ``description`` is already populated
    :return: dict mapping column name to its converter callable
    """
    by_column = {}
    for col in cursor.description:
        by_column[col.name] = get_converter(col)
    return by_column


def ResultIter(cursor_buffered, converts):
    """
    Lazily yield converted rows
    :param cursor_buffered: iterable of rows exposing ``items()``
    :param converts: dict mapping column name to a one-argument converter
    :return: generator of plain dicts with every value converted
    """
    for row in cursor_buffered:
        converted = {}
        for column_name, value in row.items():
            converted[column_name] = converts[column_name](value)
        yield converted


class Greenplum(object):
    """Thin wrapper around a single psycopg2 connection."""

    def __init__(self, server, port, database, user, token):
        """
        Open the connection
        :param server: database host
        :param port: database port
        :param database: database name
        :param user: login name
        :param token: password / access token
        """
        # Hand the credentials to psycopg2 as separate keyword parameters
        # rather than formatting them into a URI: a user or token containing
        # '@', ':', '/' or '%' would otherwise corrupt the connection string.
        self.connection = psycopg2.connect(
            host=server,
            port=port,
            dbname=database,
            user=user,
            password=token,
        )

    def run(self, query, chunk_size):
        """
        Execute a query on a uniquely named cursor
        :param query: SQL text to execute
        :param chunk_size: value assigned to the cursor's ``itersize``
        :return: the executed DictCursor; the caller is responsible for
            closing it
        """
        # Giving the cursor a name makes psycopg2 open it server-side, so
        # rows are pulled in batches of ``itersize`` while iterating.
        cursor = self.connection.cursor(cursor_factory=psycopg2.extras.DictCursor, name=uuid.uuid4().hex)
        cursor.itersize = chunk_size
        cursor.execute(query)
        return cursor

    def commit(self):
        """Commit the current transaction on the connection."""
        self.connection.commit()

    def close(self):
        """Close the underlying connection."""
        self.connection.close()


def write_to_yt(parameters, cursor):
    """
    Stream the rows of an executed cursor into a YT table
    :param parameters: mapping providing 'yt-append', 'yt-token', 'yt-proxy'
        and 'yt-table'
    :param cursor: executed cursor yielding rows that expose ``items()``
    :return: None
    """
    import yt.wrapper as yt

    append = parameters['yt-append']
    yt_client = yt.YtClient(
        token=parameters['yt-token'],
        proxy=parameters['yt-proxy'],
    )
    table = yt.TablePath(parameters['yt-table'], append=append)

    # The cursor has to be advanced before its description can be read
    # (presumably because ``Greenplum.run`` opens a named cursor whose
    # description is only filled in by the first fetch). Pull one row, then
    # put it back in front of the remaining rows so nothing is lost.
    rows = iter(cursor)
    no_row = object()
    first_row = next(rows, no_row)
    if first_row is not no_row:
        rows = chain([first_row], rows)

    row_converters = get_converters(cursor)
    schema = get_schema(cursor)
    result = ResultIter(rows, row_converters)
    with yt_client.Transaction():
        # NOTE(review): force=True is passed even when 'yt-append' is set —
        # confirm that re-creating an existing table is intended in that case.
        yt_client.create('table', table, attributes={'schema': schema, 'optimize_for': 'scan'}, force=True)
        yt_client.write_table(table, result)


def main():
    """
    Read the job parameters and the query file from the Nirvana job context,
    run the query on Greenplum and, when 'yt-table' is set, write the result
    to YT. The whole connect/query/write sequence is retried on
    ``psycopg2.OperationalError``.
    """
    context = nv.context()
    parameters = context.get_parameters()
    inputs = context.get_inputs()

    with open(inputs.get('query')) as query_file:
        query = query_file.read()

    with_retries = retry(
        n_tries=parameters['n-tries'],
        sleep=10,
        exception_cls=psycopg2.OperationalError,
    )

    def export(sql, params):
        # Both handles start as None so the cleanup below only touches what
        # was actually opened.
        database = None
        cursor = None
        try:
            database = Greenplum(
                server=params['server'],
                port=params['port'],
                database=params['database'],
                user=params['user'],
                token=params['token'],
            )
            cursor = database.run(sql, chunk_size=params['chunk-size'])
            if params.get('yt-table'):
                write_to_yt(params, cursor)
        finally:
            if cursor:
                cursor.close()
            if database:
                database.close()

    with_retries(export)(query, parameters)

# Run the Greenplum export only when the file is executed as a script.
# NOTE(review): the module-level statements that follow this guard are not
# protected by it and execute as well.
if __name__ == '__main__':
    main()

import yt.wrapper as yt
import pandas as pd
import os
import json
import io
import argparse
from itertools import islice
from sqlalchemy import create_engine, exc

# Command-line interface: seven required positional arguments, in this order.
parser = argparse.ArgumentParser()
parser.add_argument("user", type=str, help="user")
parser.add_argument("token", type=str, help="GP token")
parser.add_argument("yt_table", type=str, help="Path to yt_table")
parser.add_argument("gp_table", type=str, help="Path to gp_table")
parser.add_argument("if_exists", type=str, help="if_exists")
parser.add_argument("chunksize", type=int, help="chunksize")
parser.add_argument("grant", type=str, help="grant")

# Expose the parsed values as module-level names; GreenplumManager.__init__
# reads ``user`` and ``token`` from here.
args = parser.parse_args()
user, token = args.user, args.token
yt_table, gp_table = args.yt_table, args.gp_table
if_exists, chunksize, grant = args.if_exists, args.chunksize, args.grant


class GreenplumManager(object):
    """
    Helper around a SQLAlchemy engine for reading Greenplum tables into
    pandas and for bulk-loading DataFrames / YT tables into Greenplum.
    """

    @staticmethod
    def write(df, db_engine, table_name, if_exists='replace', sep='|',
              index=False, encoding='utf-8', dtype=None, **kwargs):
        """
        Loads table on GP
        :param df: pd.DataFrame
        :param db_engine: sqlalchemy create_engine result
        :param table_name: greenplum table name in ``schema.table`` form
        :param if_exists: action when GP table exists replace/fail/append
        :param sep: to_csv separator, also used as the COPY delimiter
        :param index: use df index
        :param encoding: encoding of the intermediate CSV
        :param dtype: column type overrides forwarded to pandas' SQLTable
        :param kwargs: accepted and ignored, so callers can share one kwargs
            dict between ``write`` and ``grouper``
        :return: None
        """
        schema, table_name = table_name.split('.')
        # Serialise the frame to an in-memory CSV that COPY reads from STDIN.
        string_data_io = io.BytesIO()
        df.to_csv(string_data_io, sep=sep, index=index, encoding=encoding)
        # Let pandas create (or replace / keep) the target table so the column
        # definitions match the frame.
        pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine)
        table = pd.io.sql.SQLTable(table_name,
                                   pd_sql_engine,
                                   frame=df,
                                   index=index,
                                   if_exists=if_exists,
                                   schema=schema,
                                   dtype=dtype)
        table.create()
        string_data_io.seek(0)
        with db_engine.connect() as connection:
            with connection.connection.cursor() as cursor:
                copy_cmd = "COPY {}.{} FROM STDIN HEADER DELIMITER '{}' CSV".format(schema, table_name, sep)
                cursor.copy_expert(copy_cmd, string_data_io)
            connection.connection.commit()

    @staticmethod
    def grouper(iterable, chunksize=10**5, **kwargs):
        """
        Iterate over the iterable object in chunks
        :param iterable: iterable object
        :param chunksize: number of items in one chunk
        :param kwargs: dummy kwargs
        :return: generator of lists, each holding at most ``chunksize`` items
        """
        # Take a single iterator up front: slicing a re-iterable object (a
        # list, for instance) directly would restart from its first item on
        # every pass and never terminate.
        iterator = iter(iterable)
        while True:
            chunk = list(islice(iterator, chunksize))
            if not chunk:
                return
            yield chunk

    @staticmethod
    def initialize_yt_wrapper(db='hahn', **kwargs):
        """
        Point the global yt.wrapper configuration at a proxy
        :param db: value stored in ``yt.config["proxy"]["url"]``
        :param kwargs: dummy kwargs
        :return: None
        """
        yt.config["proxy"]["url"] = db

    def __init__(self, server='gpdb-master.taxi.yandex.net', port=5432, database='ritchie',
                 connect_args=None, **kwargs):
        """
        Creates the engine and opens a connection
        :param server: database host
        :param port: database port
        :param database: database name
        :param connect_args: dict forwarded to create_engine; None means empty
        :param kwargs: stored and later passed to initialize_yt_wrapper
        """
        # A fresh dict per instance avoids sharing one mutable default.
        if connect_args is None:
            connect_args = {}

        # ``user`` and ``token`` are the module-level values parsed from the
        # command line.
        self.engine = create_engine(
            'postgresql://{}:{}@{}:{}/{}'.format(user,
                                                 token,
                                                 server,
                                                 port,
                                                 database),
            connect_args=connect_args)
        self.connection = self.engine.connect()
        self.params = kwargs

    def __call__(self, query, **kwargs):
        """
        Executes query and returns result as pd.DataFrame
        :param query: query
        :param kwargs: kwargs to pass in pd.read_sql
        :return: pd.DataFrame, or None when the statement returns no rows
        """
        try:
            result = pd.read_sql(query, self.connection, **kwargs)
        except exc.ResourceClosedError:
            # Raised for statements that produce no result set (a GRANT, for
            # example); the statement itself has still been executed.
            result = None
        self.connection.connection.commit()
        return result

    def load_result(self, table_name=None, **kwargs):
        """
        Returns GP table as pd.DataFrame
        :param table_name: greenplum table name in ``schema.table`` form
        :param kwargs: kwargs to pass in pd.read_sql_table
        :return: pd.DataFrame
        """
        schema, table_name = table_name.split('.')
        result = pd.read_sql_table(table_name, self.connection, schema=schema, **kwargs)
        self.connection.connection.commit()
        return result

    def write_table(self, table_name, table, **kwargs):
        """
        Uploads pd.DataFrame on GP
        :param table_name: greenplum table name
        :param table: pd.DataFrame
        :param kwargs: kwargs to pass in self.write
        :return: None
        """
        self.write(table, self.engine, table_name, **kwargs)

    def replicate(self, yt_path, table_name, if_exists='replace', **kwargs):
        """
        Replicates YT table onto GP
        :param yt_path: YT table path
        :param table_name: greenplum table name
        :param if_exists: replace/append
        :param kwargs: kwargs to pass in self.write and self.grouper
        :return: None
        """
        self.initialize_yt_wrapper(**self.params)
        if if_exists == 'replace':
            self.connection.execute('drop table if exists {}'.format(table_name))
        data = yt.read_table(yt_path)
        # Every chunk is appended: for 'replace' the table was dropped above,
        # so the first chunk re-creates it.
        for chunk in self.grouper(data, **kwargs):
            self.write(pd.DataFrame(chunk), self.engine, table_name, if_exists='append', **kwargs)


# Let a failed connection raise right here: swallowing the error would leave
# ``greenplum`` as None, and the calls below would then fail with an unrelated
# AttributeError that hides the real cause.
greenplum = GreenplumManager()

greenplum.replicate(yt_path=yt_table, table_name=gp_table,
                    if_exists=if_exists, chunksize=chunksize)

# Grant read access on the loaded table to the role given on the command line.
query = 'GRANT SELECT ON ' + gp_table + ' to ' + grant

greenplum(query)
