#include "static_zmap.h"

#include <irt/common/lib/logger/logger.h>

#include <rt-research/broadmatching/scripts/cpp-source/common/HuffmanTree.h>

#include <util/generic/string.h>
#include <util/stream/file.h>

#include <library/cpp/getopt/small/last_getopt.h>

// Counts how often every word occurs in the input stream.
//
// The stream is read element by element (elements are terminated by
// `elements_delim`). Each element is cut at the first `fields_delim` into a
// key part and a value part; both are split into words on `words_delim`,
// ignoring empty words. Words of the key are counted only when
// `keep_key_unencoded` is false. In addition, the "\\end" entry is
// incremented once per element.
//
// Returns the accumulated word -> count dictionary.
DictFrequencies ComputeWordsFrequency(
    IInputStream& data_stream,
    char elements_delim,
    char fields_delim,
    char words_delim,
    bool keep_key_unencoded) {
    DictFrequencies words_frequency;

    // Adds one occurrence for every non-empty word found in `field`.
    const auto count_words = [&](TStringBuf field) {
        for (const auto& token : StringSplitter(field).Split(words_delim).SkipEmpty()) {
            ++words_frequency[TString(token)];
        }
    };

    TString record;
    while (data_stream.ReadTo(record, elements_delim)) {
        TStringBuf rest(record);
        const TStringBuf key_part = rest.Before(fields_delim);
        // SplitOffOn() hands back what follows position key_part.size(),
        // i.e. the value part located after the fields delimiter.
        const TStringBuf value_part = rest.SplitOffOn(key_part.size());

        if (!keep_key_unencoded) {
            count_words(key_part);
        }
        count_words(value_part);

        // One end marker per element.
        ++words_frequency["\\end"];
    }

    return words_frequency;
}

// Escapes the bytes of `encoded_element` that would clash with the output
// framing.
//
// Each byte is checked, in this order, against `elements_delim`,
// `fields_delim` and `escape`; the first match wins and the byte is replaced
// by a two-byte sequence:
//   elements_delim -> escape, escaped_elements_delim
//   fields_delim   -> escape, escaped_fields_delim
//   escape         -> escape, escape
// Every other byte is copied unchanged.
//
// Returns the escaped byte sequence.
std::vector<char> ApplyEscape(
    const std::vector<char>& encoded_element,
    char elements_delim,
    char fields_delim,
    char escape,
    char escaped_elements_delim,
    char escaped_fields_delim) {
  std::vector<char> escaped;
  escaped.reserve(encoded_element.size());

  for (const char current : encoded_element) {
    char replacement;
    if (current == elements_delim) {
      replacement = escaped_elements_delim;
    } else if (current == fields_delim) {
      replacement = escaped_fields_delim;
    } else if (current == escape) {
      replacement = escape;
    } else {
      // Ordinary byte: passes through untouched.
      escaped.push_back(current);
      continue;
    }
    escaped.push_back(escape);
    escaped.push_back(replacement);
  }

  return escaped;
}

std::vector<char> EncodeField(
    const HuffmanTree& huffman_tree,
    TString field,
    char words_delim,
    bool add_end_marker = false) {
  TStringInput field_stream(field);
  TString word;
  std::vector<std::string> words;
  while (field_stream.ReadTo(word, words_delim)) {
    if (word) {
      words.push_back(word);
    }
  }
  std::vector<char> encoded_field;
  if (!huffman_tree.Encode(words, encoded_field, add_end_marker)) {
    std::cerr << "ERROR: some errors occur during encode \"" << field << "\""
              << std::endl;
  }
  return encoded_field;
}

std::vector<char> FromString(const TString& str) {
    std::vector<char> data;
    data.reserve(str.size());
    for (const char ch: str) {
        data.push_back(ch);
    }
    return data;
}

void EncodeData(
    const HuffmanTree& huffman_tree,
    IInputStream& src_stream,
    char src_elements_delim,
    char src_fields_delim,
    char src_words_delim,
    std::ostream& dst_stream,
    char dst_elements_delim,
    char dst_fields_delim,
    char dst_escape,
    char dst_escaped_elements_delim,
    char dst_escaped_fields_delim,
    bool keep_key_unencoded) {
  TString element;
  while (src_stream.ReadTo(element, src_elements_delim)) {
    TStringInput element_stream(element);
    TString key;
    element_stream.ReadTo(key, src_fields_delim);
    TString value = element_stream.ReadAll();
    std::vector<char> encoded_key =
        keep_key_unencoded ?
        FromString(key) :
        EncodeField(huffman_tree, key, src_words_delim);
    encoded_key = ApplyEscape(encoded_key,
                              dst_elements_delim,
                              dst_fields_delim,
                              dst_escape,
                              dst_escaped_elements_delim,
                              dst_escaped_fields_delim);
    std::vector<char> encoded_value = EncodeField(huffman_tree,
                                                  value,
                                                  src_words_delim,
                                                  true);
    encoded_value = ApplyEscape(encoded_value,
                                dst_elements_delim,
                                dst_fields_delim,
                                dst_escape,
                                dst_escaped_elements_delim,
                                dst_escaped_fields_delim);
    dst_stream.write(encoded_key.data(),
                     encoded_key.size() * sizeof(*encoded_key.data()));
    dst_stream.write(&dst_fields_delim, sizeof(dst_fields_delim));
    dst_stream.write(encoded_value.data(),
                     encoded_value.size() * sizeof(*encoded_value.data()));
    dst_stream.write(&dst_elements_delim, sizeof(dst_elements_delim));
  }
}

int main(int argc, char** argv) {
  NIRT::InitLog<NIRT::TTabLoggerFormatter>({.ConsoleOnly=true});

  TString src_file;
  TString dst_file;

  TString src_elements_delim;
  TString src_fields_delim;
  TString src_words_delim;

  TString dst_elements_delim;
  TString dst_fields_delim;
  TString dst_escape;
  TString dst_escaped_elements_delim;
  TString dst_escaped_fields_delim;

  TString huffman_tree_file;
  TString header_file;

  bool keep_key_unencoded = false;

  NLastGetopt::TOpts opts;
  opts.AddLongOption("src-file").Required().StoreResult(&src_file);
  opts.AddLongOption("src-ed").Optional().DefaultValue("\n").StoreResult(&src_elements_delim);
  opts.AddLongOption("src-fd").Optional().DefaultValue("\t").StoreResult(&src_fields_delim);
  opts.AddLongOption("src-wd").Optional().DefaultValue("/").StoreResult(&src_words_delim);

  opts.AddLongOption("dst-file").Required().StoreResult(&dst_file);
  opts.AddLongOption("dst-ed").Optional().DefaultValue("\n").StoreResult(&dst_elements_delim);
  opts.AddLongOption("dst-fd").Optional().DefaultValue("\t").StoreResult(&dst_fields_delim);
  opts.AddLongOption("dst-escape").Optional().DefaultValue("\\").StoreResult(&dst_escape);
  opts.AddLongOption("dst-escaped-ed").Optional().DefaultValue("0").StoreResult(&dst_escaped_elements_delim);
  opts.AddLongOption("dst-escaped-fd").Optional().DefaultValue("1").StoreResult(&dst_escaped_fields_delim);

  opts.AddLongOption("huffman-tree-file").Required().StoreResult(&huffman_tree_file);
  opts.AddLongOption("header-file").Required().StoreResult(&header_file);

  opts.AddLongOption("keep-key-unencoded").Optional().StoreTrue(&keep_key_unencoded);

  NLastGetopt::TOptsParseResult (&opts, argc, argv);

  try {
    TFileInput src_stream(src_file);
    std::ofstream dst_stream(dst_file.data(), std::ofstream::binary);
    if (dst_stream.fail()) {
        std::cerr << "ERROR: can't write file \"" << dst_file << "\"" << std::endl;
        return 0;
    }
    std::ofstream header_stream(header_file.data(), std::ofstream::binary);
    if (header_stream.fail()) {
        std::cerr << "ERROR: can't write file \"" << header_file << "\""
                  << std::endl;
        return 0;
    }
    StaticZmap::Header header("StaticZmapData", 0);
    header_stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
    header_stream.close();
    auto words_frequency = ComputeWordsFrequency(src_stream,
                                                 src_elements_delim[0],
                                                 src_fields_delim[0],
                                                 src_words_delim[0],
                                                 keep_key_unencoded);
    HuffmanTree huffman_tree;
    huffman_tree.Generate(words_frequency);
    huffman_tree.Save(huffman_tree_file.data());
    TFileInput src_encode_stream(src_file);
    EncodeData(huffman_tree, src_encode_stream, src_elements_delim[0], src_fields_delim[0],
               src_words_delim[0], dst_stream, dst_elements_delim[0], dst_fields_delim[0],
               dst_escape[0], dst_escaped_elements_delim[0], dst_escaped_fields_delim[0],
               keep_key_unencoded);
    dst_stream.close();
  } catch (TIoException& exception) {
    Cerr << "ERROR: can't read file \"" << src_file << "\"" << Endl;
  }
  return 0;
}
