#include <util/system/types.h>
#include <util/generic/vector.h>
#include <util/random/fast.h>
#include <util/charset/utf8.h>
#include <util/generic/bt_exception.h>
#include <util/digest/murmur.h>

#include "min_hash.h"

namespace NMinHash {

    // Builds a vector of n seeds drawn from a TFastRng64 that is always
    // initialised with the constant 42, so every call with the same n
    // yields the same sequence.
    TVector<TInt> GenSeeds(size_t n) {
        TVector<TInt> result(n);
        TFastRng64 generator(42);
        for (size_t idx = 0; idx < result.size(); ++idx) {
            result[idx] = static_cast<TInt>(generator.GenRand64());
        }
        return result;
    }

    // Advances b past at most n UTF-8 characters without going beyond e and
    // returns the new position. Throws a yexception with a backtrace when
    // GetUTF8CharLen reports anything other than RECODE_OK for the character at b.
    static inline const ui8* SkipUtf8(const ui8* b, const ui8* e, size_t n) {
        size_t remaining = n;
        while (remaining > 0 && b < e) {
            size_t width = 0;
            const auto status = GetUTF8CharLen(width, b, e);
            if (status != RECODE_OK) {
                ythrow TWithBackTrace<yexception>() << "cannot recode:" << TStringBuf(reinterpret_cast<const char*>(b), reinterpret_cast<const char*>(e));
            }
            b += width;
            --remaining;
        }
        return b;
    }

    // Convenience overload: starts from a signature with one slot per seed,
    // each set to the largest TInt value, lets the in/out overload lower the
    // slots, and returns the result by value.
    TVector<TInt> MinHash(const TStringBuf src, const TVector<TInt>& seeds) {
        const TInt initial = std::numeric_limits<TInt>::max();
        TVector<TInt> signature(seeds.size(), initial);
        MinHash(src, seeds, signature);
        return signature;
    }

    void MinHash(const TStringBuf src, const TVector<TInt>& seeds, TVector<TInt>& hashes) {
        if(Y_UNLIKELY(!src))
            return;

        if(Y_UNLIKELY(seeds.size() != hashes.size())) {
            hashes = TVector<TInt>(seeds.size(), std::numeric_limits<TInt>::max());
        }

        auto b = reinterpret_cast<const ui8*>(src.begin());
        auto e = reinterpret_cast<const ui8*>(src.end());

        const ui8* fourth = SkipUtf8(b, e, 3);

        do{
            for (size_t i = 0; i < seeds.size(); i++) {
                const TInt hash = MurmurHash<TInt>(b, fourth - b, seeds[i]);
                if (hash < hashes[i]) {
                    hashes[i] = hash;
                }
            }
            b = SkipUtf8(b, e, 1);
            fourth = SkipUtf8(fourth, e, 1);
        } while (fourth < e);
    }
} // namespace NMinHash
