import multiprocessing as mp
from functools import partial
from customer_service.ml.lib.data.knowledges import (
    convert_knowledge_to_categories,
    parse_snapshot
)
from customer_service.ml.lib.data.utils import (
    download_knowledges_snapshot_http,
    load_data
)


def match(supertag, product_tag, knowledge):
    """Check whether ``knowledge`` equals the category path of a supertag.

    The categories produced by
    ``convert_knowledge_to_categories(supertag, product_tag)`` are joined
    with ``'/'`` and the resulting string is compared to ``knowledge``.

    Returns:
        The result of the equality comparison between ``knowledge`` and the
        joined category path.
    """
    category_path = '/'.join(
        convert_knowledge_to_categories(supertag, product_tag)
    )
    return knowledge == category_path


def match_knowledge_text(snapshot, product_tag, knowledge):
    """Return the body of the first snapshot item that matches ``knowledge``.

    Items lacking either a ``'supertag'`` or a ``'body'`` key are skipped.
    The remaining items are tested in order with ``match``; the ``'body'``
    of the first one that matches is returned.

    Returns:
        The matching item's ``'body'`` value, or an empty string when no
        item in ``snapshot`` matches.
    """
    matching_bodies = (
        entry['body']
        for entry in snapshot
        # Key checks come first so ``match`` is only called on complete items.
        if 'supertag' in entry
        and 'body' in entry
        and match(entry['supertag'], product_tag, knowledge)
    )
    # The generator is lazy, so iteration stops at the first match.
    return next(matching_bodies, "")


def generate_query_doc_data(knowledges_url, product_tag, data_path, cluster='hahn'):
    """Build a query/doc table by pairing loaded rows with knowledge bodies.

    Steps:
      1. Download the knowledge snapshot from ``knowledges_url`` and parse it
         with ``parse_snapshot`` for ``product_tag``.
      2. Load the data via ``load_data(data_path, cluster)``.
      3. For every value in the ``'target'`` column, look up the matching
         knowledge body with ``match_knowledge_text`` across a process pool
         sized to ``mp.cpu_count()``; the results become the ``'doc'`` column.
      4. Copy the ``'text'`` column into ``'query'``.

    Args:
        knowledges_url: Location passed to ``download_knowledges_snapshot_http``.
        product_tag: Tag forwarded to snapshot parsing and to matching.
        data_path: Path passed to ``load_data``.
        cluster: Cluster name passed to ``load_data``.

    Returns:
        The loaded data restricted to the columns ``knowledgeId``,
        ``knowledgeTitle``, ``messageTime``, ``target``, ``query`` and ``doc``.
    """
    raw_snapshot = download_knowledges_snapshot_http(knowledges_url)
    parsed_snapshot = parse_snapshot(raw_snapshot, product_tag)
    frame = load_data(data_path, cluster)

    # Pre-bind the snapshot and tag so the pool only has to ship each target.
    find_doc = partial(match_knowledge_text, parsed_snapshot, product_tag)

    worker_count = mp.cpu_count()
    with mp.Pool(worker_count) as pool:
        docs = pool.map(find_doc, frame['target'])

    frame['doc'] = docs
    frame['query'] = frame['text']

    return frame[
        ['knowledgeId', 'knowledgeTitle', 'messageTime', 'target', 'query', 'doc']
    ]
