#include "initializer.h"
#include "gil.h"
#include "ptr.h"

#include <solomon/agent/lib/python2/gil.h>

#include <util/generic/singleton.h>
#include <util/generic/yexception.h>
#include <util/stream/str.h>
#include <util/system/unaligned_mem.h>

#include <string>

namespace NPython2 {
namespace {

/**
 * Appends "/usr/lib/python2.7/dist-packages" to the interpreter's sys.path
 * and stores the list back into sys. Any failure aborts via Y_VERIFY.
 */
void FixPythonSysPath() {
    // Mutable arrays: the Python 2 C API calls below are handed non-const char*.
    char sysAttrName[] = "path";
    char distPackagesDir[] = "/usr/lib/python2.7/dist-packages";

    // PySys_GetObject result is wrapped with ADD_REF so the smart pointer holds its own reference.
    TObjectPtr sysPath(PySys_GetObject(sysAttrName), TObjectPtr::ADD_REF);
    Y_VERIFY(sysPath, "cannot get sys.path");

    // sizeof() counts the trailing '\0', which must not become part of the Python string.
    TObjectPtr pyStr = PyString_FromStringAndSize(distPackagesDir, sizeof(distPackagesDir) - 1);
    Y_VERIFY(pyStr, "cannot cat path var to python string");

    // Write the list back only if the append itself succeeded.
    int rc = PyList_Append(sysPath.Get(), pyStr.Get());
    rc = (rc == 0) ? PySys_SetObject(sysAttrName, sysPath.Get()) : rc;

    Y_VERIFY(rc == 0, "cannot change sys.path");
}

// Bookkeeping header stored immediately in front of every block handed out by
// the allocator wrappers below; it remembers the size requested by the caller.
//
// The 16 byte alignment (and hence a 16 byte header size) is required for py
// object allocations to prevent SIGSEGV in py plugins which use ctypes: clang
// generates movaps, and movaps requires the memory address to be aligned on
// 16 bytes. Keeping the header a multiple of 16 bytes means the payload that
// follows it keeps whatever 16 byte alignment the underlying block has.
// Bug links: https://st.yandex-team.ru/CLOUD-84697, https://bugs.python.org/issue36618
struct alignas(16) TBlockHeader {
    size_t Size; // payload size in bytes, excluding this header
};

// The wrappers offset pointers by sizeof(TBlockHeader); fail the build if a
// future edit breaks the layout assumptions described above.
static_assert(alignof(TBlockHeader) == 16, "TBlockHeader must be 16 byte aligned");
static_assert(sizeof(TBlockHeader) % 16 == 0, "TBlockHeader size must be a multiple of 16 bytes");

/**
   From CPython source code, of original allocation functions:

   "Functions supplying platform-independent semantics for malloc/realloc/
   free.  These functions make sure that allocating 0 bytes returns a distinct
   non-NULL pointer (whenever possible -- if we're flat out of memory, NULL
   may be returned), even if the platform malloc and realloc don't.
   Returned pointers must be checked for NULL explicitly.  No action is
   performed on failure (no exception is set, no warning is printed, etc)".
*/

void FreeWrapper(void* ctx, void* ptr) {
    if (ptr == nullptr) {
        return;
    }

    TAllocContext* context = reinterpret_cast<TAllocContext*>(ctx);
    PyMemAllocator& alloc = context->OriginalAlloc;

    void* bp = reinterpret_cast<char*>(ptr) - sizeof(TBlockHeader);
    i64 size = ReadUnaligned<TBlockHeader>(bp).Size;

    context->AllocatedBytesPerDomain->Add(-size);

    return alloc.free(alloc.ctx, bp);
}

void* MallocWrapper(void* ctx, ui64 size) {
    TAllocContext* context = reinterpret_cast<TAllocContext*>(ctx);
    PyMemAllocator& alloc = context->OriginalAlloc;

    // To return a distinct non-NULL pointer
    if (size == 0) {
        size = 1;
    }

    ui64 bsize = size + sizeof(TBlockHeader);

    void* bptr = alloc.malloc(alloc.ctx, bsize);
    Y_ENSURE(bptr != nullptr, "Failed to allocate memory for Python interpreter");

    WriteUnaligned<TBlockHeader>(bptr, TBlockHeader{size});
    void* ptr = reinterpret_cast<char*>(bptr) + sizeof(TBlockHeader);

    context->AllocatedBytesPerDomain->Add(size);

    return ptr;
}

void* ReallocWrapper(void* ctx, void* ptr, ui64 size) {
    if (ptr == nullptr) {
        return MallocWrapper(ctx, size);
    }

    TAllocContext* context = reinterpret_cast<TAllocContext*>(ctx);
    PyMemAllocator& alloc = context->OriginalAlloc;

    // To return a distinct non-NULL pointer
    if (size == 0) {
        size = 1;
    }

    void* oldBp = reinterpret_cast<char*>(ptr) - sizeof(TBlockHeader);
    ui64 oldSize = ReadUnaligned<TBlockHeader>(oldBp).Size;

    ui64 bsize = size + sizeof(TBlockHeader);

    void* newBp = alloc.realloc(alloc.ctx, oldBp, bsize);
    Y_ENSURE(newBp != nullptr, "Failed to reallocate memory for Python interpreter");

    void* res = static_cast<char*>(newBp) + sizeof(TBlockHeader);
    WriteUnaligned<TBlockHeader>(newBp, TBlockHeader{size});

    i64 diff = static_cast<i64>(size) - oldSize;
    context->AllocatedBytesPerDomain->Add(diff);

    return res;
}

/**
 * Note that actual memory consumption by the Python interpreter is larger than what is tracked:
 * 1. Not yet used memory allocated for arenas is not tracked at all
 * 2. Python Obj allocator rounds a requested size up. Alignment is 8 bytes, so 3 becomes 8, 15->16, 497->504, ...
 */
// Registers the per-domain "python.AllocatedBytes" gauge, saves the allocator
// currently installed for `domain` into ctx.OriginalAlloc and installs the
// counting wrappers (MallocWrapper / ReallocWrapper / FreeWrapper) instead.
void SetAllocator(PyMemAllocatorDomain domain, TAllocContext& ctx, NMonitoring::TMetricRegistry* registry) {
    TString metricName = "python.AllocatedBytes";

    /**
     * Which allocator serves which request. Information gathered from Object/obmalloc.c
     * of CPython2 with a backport of tracemalloc. The exact version of the inspected file:
     * https://a.yandex-team.ru/arc/trunk/arcadia/contrib/tools/python/src/Objects/obmalloc.c?rev=5290939
     */
    TString domainLabel;
    switch (domain) {
        // RAW: Is never called (possibly could be though)
        case PYMEM_DOMAIN_RAW:
            domainLabel = "Raw";
            break;
        // MEM: Raw calls to system's malloc(), realloc(), free(). Is called when OBJ could not process a request:
        // 1. size > SMALL_REQUEST_THRESHOLD (https://nda.ya.ru/3VZvza)
        // 2. All arenas are currently used (https://nda.ya.ru/3VZw4D -> https://nda.ya.ru/3VZw8M)
        case PYMEM_DOMAIN_MEM:
            domainLabel = "Mem";
            break;
        // OBJ: Processes small requests. It should be tracked alongside MEM, because arenas for object pools
        // are allocated directly by OS allocators and cannot be tracked by MEM: https://nda.ya.ru/3VZwLj
        case PYMEM_DOMAIN_OBJ:
            domainLabel = "Obj";
            break;
    }

    // A domain not listed above gets no gauge assigned here.
    if (!domainLabel.empty()) {
        ctx.AllocatedBytesPerDomain = registry->IntGauge({ {"sensor", metricName}, {"domain", domainLabel} });
    }

    // Remember the current allocator first: the wrappers delegate to it.
    PyMem_GetAllocator(domain, &ctx.OriginalAlloc);

    PyMemAllocator& wrapper = ctx.Allocator;
    wrapper.ctx = &ctx;
    wrapper.malloc = MallocWrapper;
    wrapper.realloc = ReallocWrapper;
    wrapper.free = FreeWrapper;

    PyMem_SetAllocator(domain, &wrapper);
}

// Program name handed to Py_SetProgramName() in the TInitializer constructor.
constexpr char INTERPRETER_NAME[] = "Solomon-Agent-Python";

} // namespace

// Returns the TInitializer object obtained from Singleton<TInitializer>().
TInitializer* TInitializer::Instance() {
    TInitializer* const instance = Singleton<TInitializer>();
    return instance;
}

// Installs the counting allocator wrappers for all three memory domains,
// starts the embedded Python 2 interpreter, extends sys.path and enables
// threading support. The order of the steps below matters.
TInitializer::TInitializer()
    : Registry_{NMonitoring::TMetricRegistry::Instance()}
{
    // Hook the allocators before the interpreter is initialized, so that
    // allocations made during initialization go through the wrappers too.
    SetAllocator(PYMEM_DOMAIN_RAW, AllocatorRaw_, Registry_);
    SetAllocator(PYMEM_DOMAIN_MEM, AllocatorMem_, Registry_);
    SetAllocator(PYMEM_DOMAIN_OBJ, AllocatorObj_, Registry_);

    // The Python 2 API takes a non-const char*, hence the const_cast.
    Py_SetProgramName(const_cast<char*>(INTERPRETER_NAME));
    Py_InitializeEx(0); // do not install signal handlers
    FixPythonSysPath();

    PyEval_InitThreads();
    // NOTE(review): presumably TGilUnguard releases the GIL for as long as it
    // lives, letting other threads acquire it — confirm against gil.h.
    Unguard_.Reset(new TGilUnguard);
}

// Shuts the interpreter down and puts back the allocators that were saved
// by SetAllocator(). The order of the steps below matters.
TInitializer::~TInitializer() {
    // Drop the unguard object created in the constructor before finalizing.
    // NOTE(review): presumably this re-acquires the GIL — confirm against gil.h.
    Unguard_.Reset();
    Py_Finalize();

    // Restore the original allocators only after the interpreter is finalized,
    // so that finalization itself still goes through the wrappers.
    PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &AllocatorRaw_.OriginalAlloc);
    PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &AllocatorMem_.OriginalAlloc);
    PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &AllocatorObj_.OriginalAlloc);
}

} // namespace NPython2
