// Implementation of Wu Manber's Multi-Pattern Search Algorithm
// Implemented by Ray Burkholder, ray@oneunified.net
// Copyright (2008) One Unified
// For use without restriction but one:  this copyright notice must be preserved.

#include <math.h>
#include <assert.h>

#include <exception>
#include <iostream>
#include <fstream>
#include <string>
#include <stdexcept>

#include <internal/search/wu_manber.h>

// Creates a matcher with no patterns loaded: the minimum pattern length is
// zero, the initialized flag is cleared, and both lookup tables are null
// until Initialize() allocates them.
WuManber::WuManber()
    : m(0),
      m_bInitialized(false),
      m_ShiftTable(NULL),
      m_vPatternMap(NULL)
{
}

// Releases the two heap-allocated lookup tables. Both pointers start out
// null in the constructor, and delete[] on a null pointer does nothing, so
// destroying a matcher that was never initialized is safe.
WuManber::~WuManber()
{
    delete [] m_ShiftTable;
    delete [] m_vPatternMap;
}

// Reads the file named by `file` line by line, treating every line (empty
// ones included) as one pattern, and passes the collected list to
// Initialize().
//
// Throws std::runtime_error when the stream is not in a good state right
// after being opened.
void WuManber::loadPatternsFromFile(const std::string& file)
{
    std::ifstream input(file.c_str());
    if (!input.good()) {
        throw std::runtime_error("WuManber: Can't load patterns from file");
    }

    std::vector<std::string> loaded;
    std::string line;
    // getline() leaves the stream in a failed state only when it could not
    // extract anything, so a final line without a trailing newline is kept.
    while (std::getline(input, line)) {
        loaded.push_back(line);
    }

    Initialize(loaded);
}


// Builds the lookup tables used by Search() from the given pattern list.
//
// patterns_  The patterns to search for. A copy is stored in `patterns`.
//
// Steps performed:
//   1. m is set to the length of the shortest pattern that has at least B
//      characters. Patterns shorter than B cannot be indexed by the
//      B-character block hash; they are reported on std::cerr and skipped
//      (they stay in `patterns` so indices are unchanged, but never match).
//      If no pattern is usable, m stays 0.
//   2. m_lu is filled with an identity mapping for all 256 byte values.
//   3. The shift table is allocated with 2^(m_nBitsInShift * B) entries, each
//      defaulting to m - B + 1, and lowered for every block that occurs in
//      the first m characters of a usable pattern.
//   4. Every usable pattern is registered in the bucket of the block that
//      ends at position m, together with a hash of its first two characters.
//
// Calling Initialize() again replaces the previous tables.
void WuManber::Initialize(const std::vector<std::string> &patterns_) {
    patterns = patterns_;

    // Release tables from a previous call so re-initialization does not leak.
    // The pointers are nulled first so the destructor stays safe if one of
    // the allocations below throws.
    m_bInitialized = false;
    delete [] m_ShiftTable;
    m_ShiftTable = NULL;
    delete [] m_vPatternMap;
    m_vPatternMap = NULL;

    // Shortest usable pattern length. An explicit flag is used instead of
    // treating m == 0 as "not set yet", so an empty pattern in the middle of
    // the list can no longer reset the minimum.
    m = 0;
    bool haveUsablePattern = false;
    for (size_t i = 0; i < patterns.size(); ++i) {
        const size_t lenPattern = patterns[i].size();
        if (B > lenPattern) {
            std::cerr << "found pattern less than B in length"
                      << patterns[i]  << std::endl;
            continue;
        }
        m = haveUsablePattern ? std::min(m, lenPattern) : lenPattern;
        haveUsablePattern = true;
    }

    // Identity alphabet over every byte value. Search() indexes m_lu with any
    // unsigned char, so entry 255 must be populated as well.
    // NOTE(review): assumes m_lu holds at least 256 entries - confirm against
    // the declaration in the header.
    m_nSizeOfAlphabet = 0;
    for (unsigned int c = 0; c < 256; ++c) {
        m_lu[c].letter = static_cast<char>(c);
        m_lu[c].offset = m_nSizeOfAlphabet++;
    }

    // Smallest bit count able to represent every alphabet offset, computed
    // with integers to avoid floating-point rounding in log()/ceil().
    unsigned short bits = 0;
    while ((static_cast<size_t>(1) << bits) < static_cast<size_t>(m_nSizeOfAlphabet)) {
        ++bits;
    }
    m_nBitsInShift = bits;
    // can use fewer bits in shift to turn it into a hash

    // 2 ** (bits * B) entries; there will be some unused space when not hashed.
    m_nTableSize = static_cast<size_t>(1) << (m_nBitsInShift * B);

    // Blocks that appear in no pattern allow the maximum safe shift. When no
    // pattern is usable (m < B) the expression m - B + 1 would wrap around,
    // so fall back to a shift of one.
    const size_t defaultShift = (m >= B) ? (m - B + 1) : 1;

    m_ShiftTable = new size_t[m_nTableSize];
    for (size_t i = 0; i < m_nTableSize; ++i) {
        m_ShiftTable[i] = defaultShift;
    }

    m_vPatternMap = new std::vector<structPatternMap>[m_nTableSize];

    for (size_t j = 0; j < patterns.size(); ++j) {  // loop through patterns
        const std::string &pattern = patterns[j];
        if (pattern.size() < B) {
            continue;  // too short to hash; already reported above
        }
        // q is the 1-based end position of the block within the first m
        // characters of the pattern.
        for (size_t q = m; q >= B; --q) {
            unsigned int hash;
            hash  = m_lu[static_cast<unsigned char>(pattern[q - 3])].offset;
            hash <<= m_nBitsInShift;
            hash += m_lu[static_cast<unsigned char>(pattern[q - 2])].offset;
            hash <<= m_nBitsInShift;
            hash += m_lu[static_cast<unsigned char>(pattern[q - 1])].offset;

            const size_t shiftlen = m - q;
            m_ShiftTable[hash] = std::min(m_ShiftTable[hash], shiftlen);

            if (0 == shiftlen) {
                // Block ends exactly at position m: record the pattern as a
                // candidate for this hash, keyed by its two-character prefix.
                m_PatternMapElement.ix = j;
                m_PatternMapElement.PrefixHash = m_lu[static_cast<unsigned char>(pattern[0])].offset;
                m_PatternMapElement.PrefixHash <<= m_nBitsInShift;
                m_PatternMapElement.PrefixHash += m_lu[static_cast<unsigned char>(pattern[1])].offset;
                m_vPatternMap[ hash ].push_back( m_PatternMapElement );
            }
        }
    }
    m_bInitialized = true;
}

bool WuManber::Search(const std::string& Text, std::string& matched) const
{
    if ( m > Text.size() )
        return false;
    assert(m_bInitialized);
    size_t ix = m - 1; // start off by matching end of largest common pattern
    while(ix < Text.size()) {
        unsigned int hash1;
        hash1 = m_lu[static_cast<unsigned char>(Text[ix-2])].offset;
        hash1 <<= m_nBitsInShift;
        hash1 += m_lu[static_cast<unsigned char>(Text[ix-1])].offset;
        hash1 <<= m_nBitsInShift;
        hash1 += m_lu[static_cast<unsigned char>(Text[ix])].offset;
        size_t shift = m_ShiftTable[hash1];
        if(shift > 0) {
            ix += shift;
        } else {  // we have a potential match when shift is 0
            unsigned int hash2;  // check for matching prefixes
            hash2 = m_lu[static_cast<unsigned char>(Text[ix-m+1])].offset;
            hash2 <<= m_nBitsInShift;
            hash2 += m_lu[static_cast<unsigned char>(Text[ix-m+2])].offset;
            std::vector<structPatternMap> &element = m_vPatternMap[ hash1 ];
            std::vector<structPatternMap>::iterator iter = element.begin();
            while(element.end() != iter) {
                if(hash2 == (*iter).PrefixHash) {
                    // since prefix fes, compare target substring with pattern
                    const char *ixTarget = &Text[0] + ix - m + 3; // we know first two characters already match
                    const char *ixPattern = &patterns[(*iter).ix][0] + 2;  // ditto
                    while((0 != *ixTarget) && (0 != *ixPattern)) { // match until we reach end of either string
                        if(m_lu[static_cast<unsigned char>(*ixTarget)].letter == m_lu[static_cast<unsigned char>(*ixPattern)].letter ) {  // match against chosen case sensitivity
                            ++ixTarget;
                            ++ixPattern;
                        } else {
                            break;
                        }
                    }
                    if(0 == *ixPattern) {  // we fthe end of the pattern, so match found
                        matched = patterns[(*iter).ix];
                        return true;
                    }
                }
                ++iter;
            }
            ++ix;
        }
    }
    return false;
}
