import yt.wrapper as yt
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
    files
)


def unwrap_query_with_device(records):
    """Expand raw key/value query rows into structured query records.

    For every input record the ``value`` attribute is split on tab
    characters and the first four fields are used, in order, as the query
    text, the region id (converted with ``int``), the country and the
    device. The record's ``key`` attribute is emitted as ``qid``.

    Yields one ``Record`` per input record.
    """
    for row in records:
        fields = row.value.split('\t')

        yield Record(
            qid=row.key,
            query_text=fields[0],
            query_region_id=int(fields[1]),
            query_country=fields[2],
            query_device=fields[3],
        )


def unwrap_url_with_page(records):
    """Expand raw key/value feature rows into url/page records.

    The ``value`` attribute of each input record is split on tabs at most
    five times. The fields at positions 1-4 are used as ``url``,
    ``page_url``, ``snippet`` and ``title``; position 0 and anything after
    position 4 are ignored. The record's ``key`` attribute is emitted as
    ``qid``.

    Yields one ``Record`` per input record.
    """
    for row in records:
        fields = row.value.split('\t', 5)

        # Pick the four used columns by position; a row with too few
        # fields fails here with IndexError.
        url, page_url, snippet, title = [fields[pos] for pos in (1, 2, 3, 4)]

        yield Record(qid=row.key, url=url, page_url=page_url, snippet=snippet, title=title)


def sample_docs_for_markup(fml_pool, job_root, token, enrichment_batch_size):
    """Prepare the next batch of pool documents for markup.

    Takes rows ``[offset, offset + enrichment_batch_size)`` of
    ``<fml_pool>/features``, joins them with ``<fml_pool>/queries`` on
    ``qid`` and writes the result to ``<job_root>/docs_for_markup``.

    When ``<job_root>/enrichments`` exists, documents already present in it
    are excluded; in that case documents present in ``<job_root>/leftovers``
    are excluded as well, if that table exists. The leftovers table is not
    consulted when there is no enrichments table.

    The offset is stored in the ``<job_root>/offset`` table. It is created
    with value 0 when missing and overwritten with
    ``offset + enrichment_batch_size`` after the job has run.

    Args:
        fml_pool: YT path of the pool directory holding ``queries`` and
            ``features`` tables.
        job_root: YT path of the working directory; created if missing.
        token: token passed to the Nile cluster object.
        enrichment_batch_size: number of feature rows to take per run.

    Returns:
        The YT path of the ``docs_for_markup`` table.
    """
    queries_path = '{}/queries'.format(fml_pool)

    offset_path = '{}/offset'.format(job_root)
    enrichments_path = '{}/enrichments'.format(job_root)
    leftovers_path = '{}/leftovers'.format(job_root)
    docs_for_markup_path = '{}/docs_for_markup'.format(job_root)

    # Dump the effective configuration to stderr, one "label value" per line.
    for label, shown in (
        ('batch_size', enrichment_batch_size),
        ('queries_path', queries_path),
        ('offset_path', offset_path),
        ('enrichments_path', enrichments_path),
        ('leftovers_path', leftovers_path),
        ('docs_for_markup_path', docs_for_markup_path),
    ):
        print >>sys.stderr, label, shown

    if not yt.exists(job_root):
        print >>sys.stderr, 'job root does not exist, creating'
        yt.mkdir(job_root, recursive=True)

    if not yt.exists(offset_path):
        print >>sys.stderr, 'Offset file does not exist, creating'
        current_offset = 0
        yt.write_table(offset_path, [{"offset": current_offset}])
    else:
        print >>sys.stderr, 'Offset file exists'
        # Only the first row of the offset table is used.
        first_row = list(yt.read_table(offset_path))[0]
        current_offset = int(first_row['offset'])

    print >>sys.stderr, "Current offset", current_offset

    next_offset = current_offset + enrichment_batch_size
    # Row-range selector: only this slice of the features table is read.
    features_path = '{}/features[#{}:#{}]'.format(fml_pool, current_offset, next_offset)
    print >>sys.stderr, 'features_path', features_path

    job = clusters.Hahn(token=token).env(templates={'job_root': job_root}).job()

    features = job.table(features_path).map(unwrap_url_with_page)
    queries = job.table(queries_path).map(unwrap_query_with_device)

    joined_fml_docs = features.join(queries, by='qid')

    # Columns identifying a document in the enrichments/leftovers tables.
    join_key = ['query_text', 'query_region_id', 'query_device', 'url', 'page_url']

    if yt.exists(enrichments_path):
        print >>sys.stderr, "Enrichments table is present"
        already_seen = [job.table(enrichments_path)]
        if yt.exists(leftovers_path):
            print >>sys.stderr, "Leftovers table is present"
            already_seen.append(job.table(leftovers_path))
        else:
            print >>sys.stderr, "No leftovers table found"

        # Anti-join against each known table in turn, keeping only rows
        # that have no match there.
        new_docs = joined_fml_docs
        for seen_table in already_seen:
            new_docs = new_docs.join(seen_table, type='left_only', by=join_key)
    else:
        new_docs = joined_fml_docs
        print >>sys.stderr, "No enrichments table yet"

    new_docs.put(docs_for_markup_path)

    print >>sys.stderr, "Starting job"
    job.run()

    # The offset is advanced only after job.run() has returned.
    print >>sys.stderr, "Updating offset file with", next_offset
    yt.write_table(yt.TablePath(offset_path, append=False), [{"offset": next_offset}])

    return docs_for_markup_path


def main(*args):
    """Configure the YT client, run the sampling step and report its output.

    Expects exactly six positional arguments. Only the first (a sequence
    whose first element is a mapping with ``job_root``, ``fml_pool`` and
    ``enrichment_batch_size`` keys) and the fourth (the token) are used;
    the remaining ones are accepted and ignored.

    Returns:
        A one-element list with a dict naming the cluster (``'hahn'``) and
        the path of the table produced by ``sample_docs_for_markup``.
    """
    params, _in2, _in3, token, _any_param, _html_file = args

    settings = params[0]
    job_root = settings['job_root']
    fml_pool = settings['fml_pool']
    batch_size = settings['enrichment_batch_size']

    yt.update_config({"proxy": {"url": "hahn.yt.yandex.net"}, "token": token})

    result_table = sample_docs_for_markup(fml_pool, job_root, token, batch_size)
    return [{'cluster': 'hahn', 'table': result_table}]
